import base64
import os
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def send_query(query_keyword):
    # Build and print the result URI for a keyword (not used by the
    # Selenium flow below). FOFA expects the search expression
    # base64-encoded in the qbase64 parameter, so encode to bytes first.
    base64_query_keyword = base64.standard_b64encode(
        query_keyword.encode('utf-8')).decode('ascii')
    page = 1
    uri = build_fofa_query_uri(base64_query_keyword, page)
    print(uri)
def build_fofa_query_uri(base64_query_keyword, page):
    # FOFA paginates via the page parameter and takes the search
    # expression base64-encoded in qbase64.
    base_uri = 'https://fofa.so'
    uri_parameters = ('/result?page=' + str(page) +
                      '&qbase64=' + base64_query_keyword + '&full=true')
    return base_uri + uri_parameters
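
# A minimal sketch of the qbase64 round trip with an assumed example keyword
# (the encoded value was computed offline, not fetched from FOFA):
#
#     >>> base64.standard_b64encode('title="F420"'.encode('utf-8')).decode('ascii')
#     'dGl0bGU9IkY0MjAi'
#     >>> build_fofa_query_uri('dGl0bGU9IkY0MjAi', 1)
#     'https://fofa.so/result?page=1&qbase64=dGl0bGU9IkY0MjAi&full=true'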
def wait_for_ajax_loading(expected_condition, driver):
    # Block (up to 600 s) until the AJAX-rendered element appears; on
    # timeout, quit the browser rather than scrape an incomplete page.
    try:
        return WebDriverWait(driver, 600).until(expected_condition)
    except Exception as e:
        print('Timed out waiting for list_mod; quitting the browser.', e)
        driver.quit()
def fofa_login(driver):
    # Credentials come from the environment so they never live in the
    # source file.
    username = os.environ['FOFA_USERNAME']
    password = os.environ['FOFA_PASSWORD']
    username_input = driver.find_element(By.ID, 'username')
    pwd_input = driver.find_element(By.ID, 'password')
    login_btn = driver.find_element(
        By.XPATH, '//*[@id="login-form"]/table/tbody/tr[4]/td/button')
    username_input.send_keys(username)
    time.sleep(1)
    pwd_input.send_keys(password)
    login_btn.submit()
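
# A minimal sketch of supplying the credentials before a run (the variable
# names are the ones read above; the values and the script filename are
# placeholders):
#
#     C:\> set FOFA_USERNAME=your-account
#     C:\> set FOFA_PASSWORD=your-password
#     C:\> python fofa_scrape.py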
def extract_result_number_from_list_jg(list_jg):
    # Slice the match count out of the banner text between the '获得'
    # label and the '条匹配结果' suffix.
    text = list_jg.contents[2]
    pre_idx = text.find('获得') + 3
    suf_idx = text.find('条匹')
    number = text[pre_idx:suf_idx].strip()
    print('Result count:', number)
    return number
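
# Worked example on an assumed banner string (index arithmetic only, no
# live data); '获得' occupies indices 0-1, so +3 also skips the space:
#
#     text = '获得 1,234 条匹配结果'
#     text.find('获得') + 3   # -> 3
#     text.find('条匹')       # -> 9
#     text[3:9].strip()       # -> '1,234'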
def single_page_extract(divs, page):
    # Each div is one result card; results come ten per page, so the
    # printed index is the item's global position.
    page_items = []
    for idx, div in enumerate(divs):
        url = div.div.a.contents[0].strip()
        print('Index:', idx + (page - 1) * 10)
        print('URL:', url)
        list_mod_c = div.find('div', class_='list_mod_c')
        item = extract_list_mod_c(list_mod_c)
        item['URL'] = url
        page_items.append(item)
    return page_items
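
# Shape of one item returned above (keys come from extract_list_mod_c plus
# the URL added here; the values are illustrative placeholders, not live
# data):
#
#     {'URL': '203.0.113.7:8080', 'Title': 'F420 Web Console', 'OS': 'Linux',
#      'IP': '203.0.113.7', 'Date': '2018-01-01', 'Country': 'China',
#      'City': 'Beijing', 'Domain': 'N/A'}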
def next_page_extract(driver, page):
    # Click through to the next result page, wait for the AJAX-rendered
    # cards, then hand the fresh HTML to the same per-page extractor.
    next_page_button = driver.find_element(By.CLASS_NAME, 'next_page')
    next_page_button.click()
    wait_for_ajax_loading(
        EC.presence_of_element_located((By.CLASS_NAME, 'list_mod')), driver)
    time.sleep(2)  # grace period for late-arriving fragments
    page_html = driver.find_element(By.XPATH, '/html').get_attribute('outerHTML')
    soup = BeautifulSoup(page_html, 'html.parser')
    divs = soup.find_all('div', class_='list_mod')
    return single_page_extract(divs, page)
def page_items_process(page_items, f):
    # Persist only the IP of each extracted item, one per line.
    for item in page_items:
        f.write(str(item['IP']) + '\n')
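
# A hedged alternative sketch for keeping every extracted field instead of
# only the IP. It assumes the item dicts built in single_page_extract and a
# file opened with newline=''; it is not part of the original flow:
def page_items_process_csv(page_items, f):
    import csv
    fields = ['URL', 'Title', 'OS', 'IP', 'Date', 'Country', 'City', 'Domain']
    writer = csv.DictWriter(f, fieldnames=fields)
    for item in page_items:
        writer.writerow(item)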
def webdriver_action():
    output_path = 'F420_list.txt'
    webdriver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
    driver = webdriver.Chrome(service=Service(webdriver_path))
    # Log in through the NOSEC SSO endpoint, then run the search.
    driver.get('https://i.nosec.org/login?service=http%3A%2F%2Ffofa.so%2Fusers%2Fservice')
    fofa_login(driver)
    search_box = driver.find_element(By.ID, 'q')
    search_box.send_keys('title="F420"')
    time.sleep(1)
    search_btn = driver.find_element(By.XPATH, '//*[@id="search_form"]/input[2]')
    search_btn.click()
    wait_for_ajax_loading(
        EC.presence_of_element_located((By.CLASS_NAME, 'list_mod')), driver)
    page_html = driver.find_element(By.XPATH, '/html').get_attribute('outerHTML')
    soup = BeautifulSoup(page_html, 'html.parser')
    list_jg = soup.find('div', class_='list_jg')
    extract_result_number_from_list_jg(list_jg)
    page = 1
    with open(output_path, 'w') as f:
        # The first page is already rendered; extract it, then click
        # through pages 2-110 (upper bound hard-coded for this query),
        # pausing between pages to stay gentle on the server.
        divs = soup.find_all('div', class_='list_mod')
        page_items = single_page_extract(divs, page)
        page_items_process(page_items, f)
        for page in range(2, 111):
            page_items = next_page_extract(driver, page)
            page_items_process(page_items, f)
            time.sleep(5)
    driver.quit()
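
# A minimal sketch of building the same driver headless (Selenium 4 style
# Options; an assumption about the environment, not part of the original
# flow):
def make_headless_driver(webdriver_path):
    from selenium.webdriver.chrome.options import Options
    opts = Options()
    opts.add_argument('--headless=new')  # modern Chrome headless flag
    return webdriver.Chrome(service=Service(webdriver_path), options=opts)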
def extract_list_mod_c_2(list_mod_c):
    # Legacy extractor kept for reference: it addresses the <li> entries
    # by fixed position, which breaks whenever a field is missing from a
    # card. extract_list_mod_c below keys on the icon classes instead.
    lis = list_mod_c.find_all('li')
    try:
        data_title = lis[0].contents[1]
    except Exception as e:
        print(e)
        data_title = 'N/A'
    try:
        data_os = lis[1].contents[1].string
    except Exception as e:
        print(e)
        data_os = 'N/A'
    try:
        data_ip = lis[2].contents[2].string
    except Exception as e:
        print(e)
        data_ip = 'N/A'
    try:
        data_date = lis[3].contents[1]
    except Exception as e:
        print(e)
        data_date = 'N/A'
    try:
        data_country = lis[4].contents[2].a.string
    except Exception as e:
        print(e)
        data_country = 'N/A'
    try:
        data_city = lis[4].contents[2].a.find_next_sibling('a').string
    except Exception as e:
        print(e)
        data_city = 'N/A'
    try:
        data_domain = lis[5].contents[2].string
    except Exception as e:
        print(e)
        data_domain = 'N/A'
    print('Title:', data_title)
    print('OS:', data_os)
    print('IP:', data_ip)
    print('Date:', data_date)
    print('Country:', data_country)
    print('City:', data_city)
    print('Domain:', data_domain)
def extract_list_mod_c(list_mod_c):
    # Field-by-icon extractor: each <li> carries a Font Awesome icon whose
    # class identifies the field, so a missing field simply keeps 'N/A'.
    result = {
        'Title': 'N/A',
        'OS': 'N/A',
        'IP': 'N/A',
        'Date': 'N/A',
        'Country': 'N/A',
        'City': 'N/A',
        'Domain': 'N/A',
    }
    for idx, li in enumerate(list_mod_c.find_all('li')):
        if idx == 0:
            result['Title'] = li.contents[1].strip()
        elif li.i is not None:
            if 'fa-cog' in li.i.attrs['class']:
                result['OS'] = li.a.string
            elif 'fa-map-marker' in li.i.attrs['class']:
                result['IP'] = li.a.string
            elif 'fa-clock-o' in li.i.attrs['class']:
                result['Date'] = li.contents[1].strip()
            elif 'fa-plane' in li.i.attrs['class']:
                # Country and City share one <li>; the defaults already
                # cover the case where no <a> is present.
                if li.a is not None:
                    result['Country'] = li.a.string
                    result['City'] = li.a.find_next_sibling('a').string
            elif 'fa-leaf' in li.i.attrs['class']:
                result['Domain'] = li.a.string
    for key in ('Title', 'OS', 'IP', 'Date', 'Country', 'City', 'Domain'):
        print(key + ':', result[key])
    return result
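
# Self-contained demo of the icon-keyed extractor on a hand-made card. The
# markup is an approximation of FOFA's list_mod_c structure (assumed,
# mirroring the icon classes the parser keys on), not captured from the
# live site:
def _demo_extract_list_mod_c():
    sample = (
        '<div class="list_mod_c"><ul>'
        '<li><i class="fa fa-file-text-o"></i> F420 Web Console</li>'
        '<li><i class="fa fa-cog"></i><a>Linux</a></li>'
        '<li><i class="fa fa-map-marker"></i><a>203.0.113.7</a></li>'
        '<li><i class="fa fa-clock-o"></i> 2018-01-01</li>'
        '<li><i class="fa fa-plane"></i><a>China</a><a>Beijing</a></li>'
        '</ul></div>')
    soup = BeautifulSoup(sample, 'html.parser')
    return extract_list_mod_c(soup.find('div', class_='list_mod_c'))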
if __name__ == '__main__':
    webdriver_action()