# [Python] plain-text view / copy code  (forum paste header, kept as a comment so the file parses)
# -*- coding:utf-8 -*-
import re
import os
import requests
import yagmail
import urllib3
import logging
from concurrent.futures import ThreadPoolExecutor
import time
import threading
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import xlwt
import xlrd
import socket
from xlutils.copy import copy
# Fetch status code and title.
# Request headers passed to requests.get(); identifies the client as desktop Chrome 78.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.108 Safari/537.36'
    ),
}
def get_ip(url):
    """Resolve the host part of *url* to an IP address string.

    *url* may be a bare host, a ``host:port`` pair, or a full URL with any
    scheme, port and path; only the hostname is handed to the resolver.

    Returns the address of the first ``socket.getaddrinfo`` result.

    Raises:
        ValueError: if no hostname can be extracted from *url*.
        socket.gaierror: if the hostname cannot be resolved.
    """
    from urllib.parse import urlsplit

    target = url.strip()
    if "://" not in target:
        # urlsplit only recognises a network location when it follows "//".
        target = "//" + target
    host = urlsplit(target).hostname
    if not host:
        raise ValueError("no hostname in %r" % (url,))
    # Only the address is needed, so no service name is passed; this avoids
    # depending on the system's service-name database.
    return socket.getaddrinfo(host, None)[0][4][0]
def _first_match(pattern, text, default=" "):
    """Return the first case-insensitive match of *pattern* in *text*, stripped, or *default*."""
    found = re.search(pattern, text, re.IGNORECASE)
    return found.group(0).strip() if found else default


def get_codetitle(url):
    """Fetch *url* and collect its status code, title, description, keywords and IP.

    The page is requested with the module-level ``header``, certificate
    verification disabled, redirects followed and ``timeout=(3, 12)``.

    Returns:
        A tuple ``(resurl, code, title, description, keywords, ip)``.
        ``resurl`` is the final URL after redirects and ``code`` the HTTP
        status code. When the request fails ``code`` is the string
        ``"无法访问"`` and the text fields are ``" "``. A tag missing from the
        page leaves only its own field at ``" "``. ``ip`` is ``'null'`` when
        the host cannot be resolved.

    The function does not raise for request or lookup failures; it reports
    them through the placeholder values above.
    """
    code = "无法访问"
    # Every returned field needs a placeholder up front: the request below may
    # fail before any of them is assigned.
    title = description = keywords = resurl = " "
    try:
        urllib3.disable_warnings()
        requests.adapters.DEFAULT_RETRIES = 5
        res = requests.get(url, headers=header, verify=False, allow_redirects=True, timeout=(3, 12))
        res.encoding = res.apparent_encoding
        code = res.status_code
        resurl = res.url
        page = res.text
        # Each field is extracted on its own so that one missing tag does not
        # discard the others.
        title = _first_match(r"(?<=\<title\>)(?:.|\n)+?(?=\<)", page)
        description = _first_match(
            r"(?<=\<meta name=\"description\" content=\")(?:.|\n)+?(?=\" \/\>|\"\/\>)", page)
        keywords = _first_match(
            r"(?<=\<meta name=\"keywords\" content=\")(?:.|\n)+?(?=\" \/\>|\"\/\>)", page)
    except Exception:
        print('%s网址无效或者IP被封锁'%(url))
    try:
        ip = get_ip(url)
    except Exception:
        ip = 'null'
    return resurl, code, title, description, keywords, ip
def write(url):
    """Fetch the metadata for *url* and append it as one row to the results workbook.

    The row holds, in column order: the source URL, the final URL, the status
    code, the title, the description, the keywords and the IP, all as strings.
    A one-line summary is printed before the row is written.

    The workbook is ``path + savefilename + '.xls'``. It is re-read, copied
    through ``xlutils.copy`` to obtain a writable version, extended by one row
    and saved again. The whole read-modify-write cycle runs under the
    module-level ``lock``.
    """
    resurl, code, title, description, keywords, ip = (
        str(field) for field in get_codetitle(url)
    )
    print(url + "|" + code + "|" + title + "|" + ip)
    xls_file = path + savefilename + '.xls'
    with lock:
        existing = xlrd.open_workbook(xls_file)
        # The first free row index equals the current number of rows.
        next_row = existing.sheet_by_index(0).nrows
        editable = copy(existing)
        sheet = editable.get_sheet(0)
        row = (url, resurl, code, title, description, keywords, ip)
        for column, value in enumerate(row):
            sheet.write(next_row, column, value)
        editable.save(xls_file)
# NOTE(review): n is not read anywhere in the visible script; kept in case
# something outside this file relies on it.
n = 0
# The working directory WITH a trailing separator. File names are appended to
# `path` by plain string concatenation, so without the separator the workbook
# would be created next to the working directory instead of inside it.
path = os.path.join(os.getcwd(), "")
logging.captureWarnings(True)
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
start = time.time()
# Guards the read-modify-write cycle on the results workbook.
lock = threading.Lock()
# The results file is named after the start time of the run.
savefilename = time.strftime("%Y-%m-%d %H.%M.%S")
# Create the results workbook containing only the header row.
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet(u'title', cell_overwrite_ok=True)
for column, caption in enumerate(("源地址", "跳转地址", "状态码", "标题", "描述", "关键词", "IP")):
    sheet1.write(0, column, caption)
myxls.save(path + savefilename + '.xls')
# Convert the ip:port entries in url.txt to http/https URLs and save them to url-run.txt.
source_list = os.path.join(path, "url.txt")
run_list = os.path.join(path, "url-run.txt")
with open(source_list, "r") as f:
    line = f.readlines()
# Written as UTF-8 because the file is read back as UTF-8 just below.
with open(run_list, "w", encoding="utf-8") as f2:
    for i in line:
        i = i.strip()
        if not i:
            # A blank line would otherwise become the bare URL "http://".
            continue
        if i.lower().startswith(("http://", "https://")):
            # Already a full URL (either scheme): keep it untouched.
            f2.write(i + '\n')
        else:
            f2.write('http://' + i + '\n')
# Load the URL list.
with open(run_list, 'r', encoding='utf-8') as f:
    urls_data = [data.strip().strip('\\') for data in f]
# Multithreading: process every URL on a pool of worker threads.
with ThreadPoolExecutor(max_workers=100) as executor:
    pending = [(urls, executor.submit(write, url=urls)) for urls in urls_data]
# Leaving the with-block waits for all submitted jobs to finish.
end = time.time()
# An exception raised inside a worker is stored on its future; report it
# instead of letting it disappear.
for urls, job in pending:
    failure = job.exception()
    if failure is not None:
        print(f"{urls} 处理失败: {failure!r}")
print("总耗时:",end - start,"秒")
# Send an e-mail notification once the run has finished.
# NOTE(review): the account, password and recipient below are hard-coded
# placeholders; consider loading them from configuration instead.
try:
    yag = yagmail.SMTP(user="xxxxxx@163.com", password="密码你的", host='smtp.163.com')  # SMTP account and password
    try:
        contents = ['TDK获取时间:%s'%(end - start)]  # message body
        DDOSD_Sender = 'TDK获取完成通知'  # subject
        receiver = ["xxxx@qq.com"]  # recipient address(es); several may be listed
        yag.send(receiver, DDOSD_Sender, contents)  # send the message
    finally:
        yag.close()  # always release the SMTP connection, even if sending failed
except Exception as error:
    # Best effort: a failed notification must not abort the script. The actual
    # error is printed too, because the fixed hint is only a guess at the cause.
    print('smtp 同个时间发送超过10条或者过期', error)