本帖最后由 应真先生 于 2019-7-19 17:50 编辑
今天看论坛里一个老哥用urllib库写了个爬虫,我想试着把它改一下,改成requests的,结果写了一天,一直报错,老哥们帮忙看看问题出在哪里。
[Python] 纯文本查看 复制代码 import urllib.request
import re
import requests
from requests import RequestException
import os
from hashlib import md5
'''
for a in range(0,10):
fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(a) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
data = urllib.request.urlopen(fl_url).read().decode("utf-8","ignore")
tj = "/detail/(.*?).html"
p = re.compile(tj).findall(data)
p = list(set(p))
try:
for i in range(len(p)):
free = str(p)
new_url = "https://818ps.com/detail/" + free + ".html"
new_data = urllib.request.urlopen(new_url).read().decode("utf-8","ignore")
new_tj = 'https://img.tuguaishou.com/ips_templ_preview/(.*?)"'
q = re.compile(new_tj).findall(new_data)
for j in range(len(q)):
tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(q[j])
file = "F:/bing/a/" + str(a) +str(i) + str(j) + ".jpg"
print("正在下载编号:" + str(a) + str(i) + str(j))
urllib.request.urlretrieve(tup_url,filename=file)
print("下载完成")
except urllib.error.URLError as e:
if hasattr(e, 'code'):
print(e.code)
if hasattr(e, "reason"):
print(e.reason)
'''
def get_page(offset):
    """Fetch one search-result page and return its HTML text, or None on failure.

    Bug fix: the original called requests.get() *outside* the try block, so a
    network failure (the only thing that raises RequestException here) was
    never caught — the try wrapped only the status-code comparison, which
    cannot raise. Non-200 responses now return None explicitly instead of
    falling off the end of the function.
    """
    base_url = 'https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/' + str(offset)
    url = base_url + '.html?route_id=15634484581189&route=3,&after_route=3'
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except RequestException:
        return None
    return None  # explicit: non-200 pages yield no HTML
def parse_url(html):
    """Extract every detail-page href from a search-result page.

    Matches anchors of the form ``<a ... open-detail ... href="...">`` and
    returns the captured href values in document order.

    The original copied ``re.findall``'s result element-by-element into a
    second list; ``findall`` already returns exactly the list we want.
    """
    pattern = re.compile('<a.*?open-detail.*?href="(.*?)"', re.S)
    return pattern.findall(html)
def save_image(urls):
    """Download the preview image from each detail page in *urls*.

    Each entry in *urls* is a site-relative path (e.g. ``/detail/123.html``);
    the image ``src`` and the page ``title`` are scraped from the detail page
    and the image is saved as ``<title>.png`` in the working directory.

    Bug fix (the crash the poster hit): ``re.findall`` with capture groups
    returns a list of *tuples*, not match objects, so the original's
    ``result.group[1]`` raised AttributeError (``group`` is a method on a
    Match, and is not subscriptable anyway). The tuples are unpacked
    directly: first group is the src, second the title.
    """
    # Hoisted out of the loop: compile once instead of per page.
    pattern = re.compile('src="(.*?)"\salt.*title="(.*?)">', re.S)
    for url in urls:
        resp = requests.get('https://818ps.com' + url)
        try:
            if resp.status_code != 200:
                continue  # skip pages we could not fetch
            for src, title in pattern.findall(resp.text):
                print(src, title)
                download = requests.get(src)
                file_path = '{file_name}.{file_suffix}'.format(
                    file_name=title,
                    file_suffix='png')
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(download.content)
                    print('Downloaded image path is %s' % file_path)
                else:
                    print('Already Downloaded', file_path)
        except Exception as e:
            # Best-effort scraper: report the failure and continue with the
            # next URL rather than aborting the whole batch.
            print(e)
# Default page offset kept at module level for compatibility; main() below
# fetches page 1 explicitly.
offset = 1


def main():
    """Scrape search page 1: collect detail URLs, then download their images."""
    page_html = get_page(1)
    detail_urls = parse_url(page_html)
    print(detail_urls)
    save_image(detail_urls)


if __name__ == '__main__':
    main()
已找到问题出在哪里,并且把一楼老哥的代码改了一下,加入多线程,保存在img文件夹里用网页标题显示
[Python] 纯文本查看 复制代码 #!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
import os
from multiprocessing import Pool
def get_page(offset):
    """Fetch search-result page *offset* and return its unique template ids.

    Bug fix: the original pattern ``"/detail/(.*?).html"`` left the dot
    unescaped, so it also matched ids followed by e.g. ``Xhtml``; ``r"\."``
    pins it to a literal ``.html`` suffix.
    """
    fl_url = ("https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/"
              + str(offset)
              + ".html?route_id=15634482253883&route=1,3,&after_route=1,3")
    data = requests.get(fl_url).text
    ids = re.compile(r"/detail/(.*?)\.html").findall(data)
    # De-duplicate; downstream order is not significant (each id is fetched
    # independently by save_page).
    return list(set(ids))
def save_page(p):
    """Download the preview image for every template id in *p* into ./img/.

    Each image is saved as ``img/<page title>.jpg`` with ``/`` in the title
    replaced by ``_`` so it is a valid file name.

    Fixes vs. the original:
      * the inner loop ``for j, p in q`` rebound the parameter ``p`` while
        the outer ``for i in p`` was still iterating it — a latent shadowing
        bug; distinct names are used now;
      * ``except requests.exceptions.InvalidURL`` missed every other network
        error (timeouts, connection errors, ...), and requests exceptions
        have no ``code``/``reason`` attributes so nothing was ever printed;
        the broader ``RequestException`` is caught and printed instead;
      * the image is now fetched only when it is not already on disk
        (the original downloaded it first and then checked).
    """
    new_tj = '//img.tuguaishou.com/ips_templ_preview/(.*?)"\salt.*title="(.*?)"/>'
    pattern = re.compile(new_tj)  # compiled once, reused for every page
    img_path = 'img'
    try:
        for templ_id in p:
            new_url = "https://818ps.com/detail/" + str(templ_id) + ".html"
            new_data = requests.get(new_url).text
            for img_id, title in pattern.findall(new_data):
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(img_id)
                safe_title = re.sub('/', '_', str(title))
                print("正在下载编号:" + safe_title)
                if not os.path.exists(img_path):
                    os.makedirs(img_path)
                file_path = img_path + os.path.sep + safe_title + ".jpg"
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(requests.get(tup_url).content)
                    print("下载完成")
                else:
                    print('已经下载', file_path)
    except requests.exceptions.RequestException as e:
        print(e)
def main(offset):
    """Worker entry point: scrape one search-result page and save its images."""
    save_page(get_page(offset))
group_start = 1   # first search-result page (inclusive)
group_end = 10    # last search-result page (inclusive)

if __name__ == '__main__':
    # Fan the page range out across a worker pool; each worker handles one
    # search-result page end-to-end.
    pool = Pool()
    pages = list(range(group_start, group_end + 1))
    pool.map(main, pages)
    pool.close()
    pool.join()