from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.parse import urlparse
import queue
import random
import re
import requests
import sys
import time
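# Scrape a Tianya (bbs.tianya.cn) forum thread: walk the thread page by page,
# keep only the original poster's content, save it as plain text named after the
# thread title, and download the images the poster embedded.
# Invoked as: python tianyaText.py page_url (the thread's first-page URL).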
class Tianya:
    """Scraper for a Tianya forum thread: collects the original poster's
    content page by page and downloads the images referenced in it."""

    def __init__(self, url):
        self.url = url
        self.content = ""
        self.hostId = ""
        self.title = ""
        self.encoding = ""
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"}
        self.urlPart = urlparse(self.url)
        self.nextPageQueue = queue.Queue()
    def getNextPage(self):
        soup = self.getPageSourceSoup()
        print(self.url)
        # Fetching the page failed; stop crawling.
        if not soup:
            return False
        # On the first page, extract the original poster's id (hostId) and the
        # thread title from the inline script block.
        if not self.hostId:
            script_tags = soup.find_all('script')
            regex_pattern = re.compile(r'authorId\s+:\s+"\d+"')
            authorid_tags = re.findall(regex_pattern, str(script_tags[0]))
            authorid = re.findall(re.compile(r'\d+'), authorid_tags[0])[0]
            self.hostId = authorid
            self.title = soup.title.text
        # Collect the poster's content on this page and append it to the output file.
        content = self.getContent(soup)
        self.saveNovelFile(content)
        # Follow the "next page" link if it exists; otherwise report that we are done.
        nextPage_tags = soup.find_all('a', attrs={'class': 'js-keyboard-next'})
        if len(nextPage_tags) >= 1 and 'href' in nextPage_tags[0].attrs:
            # Pause between requests to avoid hammering the site.
            time.sleep(random.randint(7, 22))
            self.url = urljoin(self.url, nextPage_tags[0].attrs['href'])
            return True
        return False
    def getPageSourceSoup(self):
        try:
            res = requests.get(self.url, headers=self.headers, timeout=30)
            res.raise_for_status()
            # Use the detected (apparent) encoding so the page text decodes correctly.
            res.encoding = res.apparent_encoding
            self.encoding = res.encoding
            soup = BeautifulSoup(res.text, 'html.parser')
            return soup
        except Exception as e:
            print(e)
            return None
    def getContent(self, soup):
        content = ""
        # Divs tagged with the poster's hostId hold the original poster's content.
        content_tags = soup.find_all('div', attrs={"_hostid": self.hostId})
        for div in content_tags:
            content_tag = div.find('div', attrs={'class': 'bbs-content'})
            if content_tag is None:
                continue
            content += str(content_tag)
            # Image handling is imperfect: the 'src' attribute of <img> only points to
            # Tianya's lazy-loading placeholder image, while the real image URL is stored
            # in the 'original' attribute. Ideally the page should be fully loaded
            # (e.g. with a browser driver and explicit waits) before image URLs are parsed.
            img_tags = content_tag.find_all('img')
            for img_tag in img_tags:
                if 'original' in img_tag.attrs:
                    imgUrl = urljoin(self.url, img_tag.attrs['original'])
                    self.downloadImg(imgUrl)
        return content
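    # The note above suggests waiting for the page to finish loading before parsing
    # image URLs. A minimal sketch of that idea, assuming Selenium and a Chrome driver
    # are available (not used by this script as written):
    #
    #   from selenium import webdriver
    #   from selenium.webdriver.common.by import By
    #   from selenium.webdriver.support.ui import WebDriverWait
    #   from selenium.webdriver.support import expected_conditions as EC
    #
    #   driver = webdriver.Chrome()
    #   driver.get(page_url)
    #   WebDriverWait(driver, 15).until(
    #       EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.bbs-content img')))
    #   html = driver.page_source  # parse this with BeautifulSoup as above
    #   driver.quit()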
    # Download one image referenced in the thread.
    def downloadImg(self, imgUrl):
        imgFileName = imgUrl[imgUrl.rindex('/') + 1:]
        try:
            res = requests.get(imgUrl, stream=True, headers=self.headers, timeout=30)
            with open(imgFileName, 'wb') as img_file:
                # Stream the response body to disk in 1 KB chunks.
                for chunk in res.iter_content(chunk_size=1024):
                    if chunk:
                        img_file.write(chunk)
        except requests.exceptions.RequestException as e:
            print(e)
    # Strip the HTML markup and append the plain text to the output file.
    def saveNovelFile(self, content):
        # Remove opening <div ...> tags.
        div_tag_pattern = re.compile(r'<div[^>]*>')
        div_tags = re.findall(div_tag_pattern, content)
        for div_tag in div_tags:
            content = content.replace(div_tag, '')
        # Collapse long runs of whitespace into a single space.
        space_tag_pattern = re.compile(r'\s{3,}')
        space_tags = re.findall(space_tag_pattern, content)
        for space_tag in space_tags:
            content = content.replace(space_tag, ' ')
        # Turn <br/> into line breaks and drop the closing </div> tags.
        content = content.replace('<br/>', '\r\n').replace('</div>', '')
        with open(self.title + ".txt", 'a', encoding=self.encoding) as f:
            f.write(content)
    def run(self):
        while True:
            if not self.getNextPage():
                break
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: python tianyaText.py page_url')
        print('page_url: URL of the first page of the thread')
    else:
        url = sys.argv[1]
        tianya = Tianya(url)
        tianya.run()