from bs4 import BeautifulSoup
from requests.exceptions import ProxyError
from urllib.parse import urlparse
import cchardet
import json
import os
import random
import re
import requests
import sys
import time
proxies = {"http": "http://127.0.0.1:8087",
"https": "http://127.0.0.1:8087"}
# Starting from a first page, keep following the "next page" link, download the text and append it to a file.
class ChainUrl():
    def __init__(self,url,fileName,encoding='',useProxies=False):
        self.url=url
        self.fileName=fileName
        self.encoding=encoding  # may be empty; it is then detected from the first page
        self.useProxies=useProxies
        self.tryTimes=0
        self.patternDict={}
        self.patternFileName=''
        self.headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
        # only use the module-level proxy settings when the caller asks for them
        self.proxies=proxies if self.useProxies else None
        self.initPatternDict()
    # Fetch the current page and return it parsed with BeautifulSoup
def getHtmlBs(self):
if not self.url:
return None
        # retry each page at most 20 times
        while self.tryTimes<=20:
try:
                if self.useProxies:
                    html=requests.get(self.url,headers=self.headers,proxies=self.proxies,timeout=30)
                else:
                    html=requests.get(self.url,headers=self.headers,timeout=30)
                # if no encoding is known yet, detect it from the raw bytes, store it in
                # self.patternDict and update the pattern dictionary file
                if not self.patternDict['encoding']:
                    self.detectEncoding(html.content)
                    self.updatePatternDictFile()
                html.encoding=self.encoding
bs=BeautifulSoup(html.text,features='html.parser')
return bs
            except ProxyError as e:
                print(e)
                # wait a couple of seconds before retrying (time.sleep takes seconds)
                time.sleep(2)
                self.tryTimes+=1
            except Exception as e:
                print("Request failed. Possible reasons:\n 1. no network connection\n 2. this site can only be reached through a proxy server\n")
                print(e)
                break
return None
    # Extract the wanted page content and save it to the file
    def parseHtml(self,bs):
        # the page was fetched successfully, so reset the retry counter
        self.tryTimes=0
text=""
        if 'article' in self.patternDict:
            for param in self.patternDict['article']:
                elem=self.patternDict['article'][param]['tag_name']
                attrs=self.patternDict['article'][param]['attrs']
                text_source=bs.find(elem,attrs=attrs)
                if text_source is None:
                    # the page does not contain this element, skip it
                    continue
                text+=str(text_source)
                # strip the markup we do not need:
                # first the opening tag together with its attributes ...
                str_attrsDict=''
                for k,v in attrs.items():
                    str_attrsDict+=k+'="'+re.escape(v)+'"'
                openTag=re.compile('<'+elem+r'\s+'+str_attrsDict+r'[^>]*>')
                text=re.sub(openTag,'',text)
                # ... then common inline tags and the closing tag
                text=text.replace('<p>','\n').replace('</p>','').replace('<br/>','\n').replace('</div>','\n')
                text=re.sub(re.compile('</'+elem+'>'),'',text)
self.saveToFile(text)
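        # Illustrative example of the tag stripping above, assuming the default
        # attrs={'id':'contentbox'} and hypothetical page content:
        # >>> re.sub(re.compile(r'<div\s+id="contentbox"[^>]*>'),'','<div id="contentbox">text</div>')
        # 'text</div>'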
    # Find and follow the link to the next page
def getNextPage(self,bs):
print(self.url)
urlPart=urlparse(self.url)
nextPage=bs.find('a',href=True,text='下一页')
if not nextPage:
nextPage=bs.find('a',href=True,text='下一章')
if nextPage:
if 'href' in nextPage.attrs:
                # pause for a random 0-10 seconds so we do not hammer the server
                time.sleep(random.randint(0,10))
                nextPageUrl=str(nextPage.attrs['href'])
                if self.is_absolute(nextPageUrl):
                    self.url=nextPageUrl
                else:
                    self.url=str(urlPart.scheme)+"://"+str(urlPart.netloc)+nextPageUrl
else:
self.url=None
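    # Illustrative next-page anchor that getNextPage looks for (hypothetical href value):
    #   <a href="/book/2.html">下一页</a>   or   <a href="/book/2.html">下一章</a>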
    # Detect the page encoding from the raw response bytes with cchardet
    def detectEncoding(self,content):
        # cchardet.detect returns a dict with 'encoding' and 'confidence' keys
        detectResultDict=cchardet.detect(content)
        if detectResultDict['confidence'] and detectResultDict['confidence']>0.7:
            self.encoding=detectResultDict['encoding']
            self.patternDict['encoding']=self.encoding
        else:
            print('Encoding detection is probably wrong, please set the page encoding manually\n')
            sys.exit(1)
    # Append the extracted text to the output file, using the detected page encoding
    def saveToFile(self,text):
        with open(self.fileName,'a',encoding=self.encoding) as file:
file.write(text)
    # A URL is absolute if it already carries its own netloc (host) part
    def is_absolute(self,url):
return bool(urlparse(url).netloc)
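    # Illustrative behaviour (hypothetical URLs):
    #   urlparse('https://www.example.com/2.html').netloc -> 'www.example.com'  (absolute)
    #   urlparse('/2.html').netloc                        -> ''                 (relative)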
    # Update the page-pattern dictionary file on disk
    def updatePatternDictFile(self):
        with open(self.patternFileName,'w') as file:
            file.write(json.dumps(self.patternDict)+"\n")
    # Initialize the page-pattern dictionary (which elements to extract and which encoding to use)
def initPatternDict(self):
self.patternFileName=urlparse(self.url).netloc
if os.path.exists(self.patternFileName):
with open(self.patternFileName) as patternFile:
try:
line=patternFile.read()
self.patternDict=json.loads(line)
self.encoding=self.patternDict['encoding']
            except Exception as e:
                print('Failed to read the page-pattern dictionary file, please check the dictionary file!')
                sys.exit()
        else:
            print("The document-pattern dictionary must be set up manually!")
            print("Make sure the entries under 'article' below select the content you want.")
            self.patternDict={
                # page encoding; if left empty it is detected from the first page
                'encoding':self.encoding,
                'article':{
                    # keep only 'content' if the page has no separate title element,
                    # add a 'title' entry when it does
                    'title':{'tag_name':'h1','attrs':{'id':'timu'}},
                    'content':{'tag_name':'div','attrs':{'id':'contentbox'}}
                }
            }
    # Save the page-pattern dictionary to its file (not currently called; duplicates updatePatternDictFile)
def updateDictFile(self):
with open(self.patternFileName,'w') as patternFile:
patternFile.write(json.dumps(self.patternDict))
def run(self):
        # keep looping while there is still a page URL to fetch
while True:
bs=self.getHtmlBs()
if not bs:
break
self.parseHtml(bs)
self.getNextPage(bs)
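# Illustrative use as a library instead of via the command line (hypothetical URL and file name):
#   crawler=ChainUrl('https://www.example.com/book/1.html','novel.txt',encoding='utf-8',useProxies=False)
#   crawler.run()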
if __name__=='__main__':
    encoding=''
    useProxies=False
    params_len=len(sys.argv)
    if params_len<3:
        # arguments: start URL, output file name, page encoding, whether to use a proxy server
        print('Usage: getTextFromUrlChain startUrl savedFileName.txt encoding useProxies')
        sys.exit(1)
    if params_len>=4:
        encoding=sys.argv[3]
    if params_len>=5:
        # treat anything except "true"/"1"/"yes" as False
        useProxies=sys.argv[4].lower() in ('true','1','yes')
    startUrl=sys.argv[1]
    fileName=sys.argv[2]
    textUrl=ChainUrl(startUrl,fileName,encoding,useProxies)
    textUrl.run()
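# Example command line (illustrative; the start URL and output file are made up):
#   python getTextFromUrlChain.py https://www.example.com/book/1.html novel.txt utf-8 true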