import re
import queue
import time
import requests
import urllib3
# Extract every link from the page source and store it in the
# matching dictionary or queue
def extract_url(text=None):
    global url_main
    global url_head
    global n_sum
    global w_sum
    global n_fp
    global w_fp
    if text is None:
        return {"status": False, "code": "argument is None"}
    try:
        # Pull all links out of the page
        links = re.findall("<a href=['\"](.+?)[\"']", text, re.S)
    except re.error:
        return {"status": False, "code": "regex parsing error"}
    # Walk whatever was extracted
    for i in links:
        # Check whether this is a usable URL; is_web() accepts both
        # absolute forms like https://www.baidu.com and relative
        # forms like ./book/one.html
        is_url = is_web(i)
        if is_url["status"]:
            # print(is_url["code"])
            if i[0] == 'h':
                # Absolute link: extract its main domain
                temp = get_urlmain(i)
                if temp["status"]:
                    temp = temp["code"]
                else:
                    print(temp["code"])
                    continue
                # Same main domain as the site being crawled -> internal link
                if url_main == temp:
                    if i not in n_sum:
                        n_sum[i] = i
                        n_buffer.put(i)  # queue it so it gets crawled too
                        n_fp.write(i + "\n")
                        print(i)
                # Different main domain -> external link
                else:
                    if i not in w_sum:
                        w_sum[i] = i
                        w_fp.write(i + "\n")
                        print(i)
            else:
                # javascript: style targets are rejected by is_web(),
                # so only relative paths like ./book/one.html end up here
                if i not in n_sum:
                    full = url_head + url_main + i
                    n_sum[i] = full
                    n_buffer.put(full)
                    n_fp.write(full + "\n")
                    print("combined internal link", full)
            # print(url_head, url_main, i, is_url["code"], is_url["code"][0])
    print(url_head, url_main, "internal", len(n_sum), "external", len(w_sum))
    print("internal-link buffer", n_buffer.qsize())
    # while not n_buffer.empty():
    #     print(n_buffer.get())
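# A quick, hypothetical illustration of the href pattern used above
# (sample snippet, not taken from an actual page):
#   re.findall("<a href=['\"](.+?)[\"']", "<a href='/forum.php'>home</a>")
# would return ['/forum.php'].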
# Extract the main domain of a URL
def get_urlmain(url=None):
    if url is None:
        return {"status": False, "code": "argument is None"}
    try:
        url_main = re.search(r"((\w+?\.\w+?$)?(\w+?\.\w+?)$|(\w+?\.\w+?)[/'\"])", url, re.S)
        # No idea why, but the result always keeps a trailing slash,
        # so just strip it out
        url_main = re.sub("/", "", url_main.group(0))
        return {"status": True, "code": url_main}
    except AttributeError:
        # re.search found nothing and returned None
        # print(url)
        return {"status": False, "code": "regex extraction failed!"}
# Check whether a URL is usable
def is_web(url=None):
    if url is None:
        return {"status": False, "code": "argument is None"}
    # The http check has to come first, since the ./ pattern could also hit
    # http URLs; both patterns are anchored with ^ so that targets like
    # javascript: or mailto: cannot match partway through the string
    receive = re.search(r"^https?:.+", url)
    if receive is not None:
        return {"status": True, "code": receive.group(0)}
    receive = re.search(r"^[\./]{1,3}.+", url)
    if receive is not None:
        return {"status": True, "code": receive.group(0)}
    return {"status": False, "code": "not a usable URL"}
if __name__ == "__main__":
    # Overall collections of internal and external links
    ## internal-link dictionary
    n_sum = {}
    ## external-link dictionary
    w_sum = {}
    ## internal-link buffer queue
    n_buffer = queue.Queue()
    ## internal-link exception queue
    n_abnormal = queue.Queue()
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    # verify=False is passed to requests.get below, so silence the
    # InsecureRequestWarning it would otherwise print on every request
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    date = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    # Output paths; the files are written to the current directory
    n_fp = open("./n&" + date + ".txt", "w+", encoding="utf-8")
    w_fp = open("./w&" + date + ".txt", "w+", encoding="utf-8")
    # The site to crawl
    url = "https://www.52pojie.cn/"
    n_buffer.put(url)
    # Stop as soon as the queue runs empty
    while not n_buffer.empty():
        url = n_buffer.get()
        url_main = get_urlmain(url)
        if url_main["status"]:
            url_main = url_main["code"]
        else:
            print(url_main["code"])
            continue
        url_head = re.search(r"https?://(www)?\.?", url)
        if url_head is None:
            continue
        url_head = url_head.group(0)
        try:
            html = requests.get(url, headers=headers, timeout=5, verify=False)
        except requests.RequestException:
            print("the request probably failed", url)
            print("internal", len(n_sum), "external", len(w_sum))
            print("internal-link buffer", n_buffer.qsize())
            continue  # without this, extract_url would run on a stale response
        # print(html.text)
        extract_url(html.text)
        # time.sleep(0.5)
    n_fp.close()
    w_fp.close()
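# When run, the script keeps crawling until the internal-link buffer drains,
# writing two files to the current directory (timestamps will differ), e.g.:
#   n&2022-01-01 12-00-00.txt  -- internal links, one per line
#   w&2022-01-01 12-00-00.txt  -- external links, one per line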