python3爬虫爬取百度贴吧帖子图片
本帖最后由 书写情书 于 2017-7-28 09:17 编辑。因为刚开始写,不是很清楚注释怎么写,就没写注释。
用了 request 和 BeautifulSoup 爬虫框架
灵感是来源于 某课堂的一个爬虫教程
```
import requests
import re
from threading import Thread
from bs4 import BeautifulSoup
import os
class Pictures(object):
    """Crawl a Baidu Tieba thread and download every post image (.BDE_Image).

    Downloads go into the current working directory; the caller is expected
    to chdir into the target folder first (see the __main__ block).
    """

    # Pretend to be a regular browser so the image host serves the file.
    HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}

    def __init__(self, url):
        # Base URL of the thread, e.g. https://tieba.baidu.com/p/5245797575
        self.url = url

    def main(self):
        """Download images from the base page, then crawl pages 0-21 on two
        worker threads and wait for both to finish."""
        soup = self.url_open(self.url)
        self.get_img_url(soup)
        # BUG FIX: args must be an ordered tuple, not a set literal — a set
        # has no guaranteed iteration order, so (a, b) could arrive swapped.
        t1 = Thread(target=self.thread, args=(0, 11))
        t2 = Thread(target=self.thread, args=(11, 22))
        t1.start()
        t2.start()
        # Join so main() only returns once all downloads are complete.
        t1.join()
        t2.join()

    def thread(self, a, b):
        """Worker: crawl pages a..b-1 of the thread and download their images.

        a, b -- half-open page-number range, as for range(a, b).
        """
        for i in range(a, b):
            # Derive the page URL from self.url instead of hard-coding the
            # thread id, so the class works for any thread passed to __init__.
            page_url = '%s?pn=%d' % (self.url, i)
            print('第%d页' % i)
            self.get_img_url(self.url_open(page_url))

    def get_img_url(self, soup):
        """Extract the jpg URL and file name of every post image in *soup*
        and download each one."""
        for img in soup.select('.BDE_Image'):
            src_match = re.search(r'https://.+?jpg', str(img))
            if src_match is None:
                # Non-jpg image or unexpected markup: skip instead of
                # crashing with AttributeError on None.group().
                continue
            src = src_match.group()
            name_match = re.search('sign=.+/(.+jpg)', src)
            if name_match is None:
                continue
            self.sava_img(src, name_match.group(1))

    def sava_img(self, url, file_name):
        """Download one image and write it to *file_name* in the CWD.

        NOTE(review): 'sava' is a historical typo for 'save'; the name is
        kept unchanged for backward compatibility.
        """
        data = requests.get(url, headers=self.HEADERS).content
        with open(file_name, 'wb') as f:
            print("正在下载", file_name)
            f.write(data)

    def url_open(self, url):
        """GET *url*, decode as UTF-8 and return the parsed BeautifulSoup."""
        req = requests.get(url, headers=self.HEADERS)
        req.encoding = 'utf-8'
        return BeautifulSoup(req.text, 'html.parser')
if __name__ == "__main__":
    # Thread whose images we want to scrape.
    thread_url = 'https://tieba.baidu.com/p/5245797575'
    # All images are saved into ./classimg; create it on first run and
    # chdir so Pictures can write files with bare names.
    target_dir = 'classimg'
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    os.chdir(target_dir)
    Pictures(thread_url).main()
``` 源码是经过修改才发出来的,和图里的源码不一致,但是下载方式都是一样的。 先收藏。 感谢分享。 批量保存会用到 收藏起来,谢谢楼主分享 收藏了,谢谢楼主 这个牛呀,学习学习 好好的学习了 感谢分享。 先收藏了再说谢谢楼主
页:
[1]
2