Help me out, guys. I spent the whole afternoon on this and couldn't get it working. Could someone help me fix it?
Today I saw a scraper another forum member wrote with the urllib library, and I tried to rewrite it with requests. After a whole day of work it still keeps throwing errors. Could someone take a look and tell me where the problem is?
import urllib.request
import re
import requests
from requests import RequestException
import os
from hashlib import md5
'''
for a in range(0, 10):
    fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(a) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    data = urllib.request.urlopen(fl_url).read().decode("utf-8", "ignore")
    tj = "/detail/(.*?).html"
    p = re.compile(tj).findall(data)
    p = list(set(p))
    try:
        for i in range(len(p)):
            free = str(p[i])
            new_url = "https://818ps.com/detail/" + free + ".html"
            new_data = urllib.request.urlopen(new_url).read().decode("utf-8", "ignore")
            new_tj = 'https://img.tuguaishou.com/ips_templ_preview/(.*?)"'
            q = re.compile(new_tj).findall(new_data)
            for j in range(len(q)):
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(q[j])
                file = "F:/bing/a/" + str(a) + str(i) + str(j) + ".jpg"
                print("Downloading item: " + str(a) + str(i) + str(j))
                urllib.request.urlretrieve(tup_url, filename=file)
                print("Download finished")
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
'''
def get_page(offset):
    base_url = 'https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/' + str(offset)
    url = base_url + '.html?route_id=15634484581189&route=3,&after_route=3'
    response = requests.get(url)
    try:
        if response.status_code == 200:
            return response.text
    except RequestException:
        return None
def parse_url(html):
    pattern = re.compile('<a.*?open-detail.*?href="(.*?)"', re.S)
    items = re.findall(pattern, html)
    urls = []
    for url in items:
        urls.append(url)
    return urls
def save_image(urls):
    for i in range(len(urls)):
        resp = requests.get('https://818ps.com' + urls)
        text = resp.text
        try:
            if resp.status_code == 200:
                pattern = re.compile('src="(.*?)"\salt.*title="(.*?)">', re.S)
                results = re.findall(pattern, text)
                for result in results:
                    print(result.group, result.group)
                    download = requests.get(result.group)
                    file_path = '{file_name}.{file_suffix}'.format(
                        file_name=result.group,
                        file_suffix='png')
                    if not os.path.exists(file_path):
                        with open(file_path, 'wb') as f:
                            f.write(download.content)
                        print('Downloaded image path is %s' % file_path)
                    else:
                        print('Already Downloaded', file_path)
        except Exception as e:
            print(e)
offset = 1

def main():
    html = get_page(1)
    urls = parse_url(html)
    print(urls)
    save_image(urls)

if __name__ == '__main__':
    main()
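For later readers, the errors in the snippet above can be pinned down. In save_image, 'https://818ps.com' + urls adds a string to the whole list (a TypeError); it should iterate the items. Then re.findall with a two-group pattern returns plain (src, title) tuples, not match objects, so result.group raises AttributeError. Also, in get_page the requests.get call sits outside the try block, so a RequestException can never be caught there. Here is a minimal corrected sketch of save_image, keeping the original regex; the protocol-relative 'https:' fix-up and the md5-based file name are my own assumptions (the unused md5 import hints that a hash name was intended):

import os
import re
import requests
from hashlib import md5

def save_image(urls):
    for url in urls:  # iterate the items instead of concatenating the whole list
        resp = requests.get('https://818ps.com' + url)
        if resp.status_code != 200:
            continue
        pattern = re.compile(r'src="(.*?)"\salt.*title="(.*?)">', re.S)
        # findall with two capture groups yields (src, title) tuples,
        # not match objects, so there is no .group to call on them
        for src, title in re.findall(pattern, resp.text):
            if src.startswith('//'):  # the site serves protocol-relative URLs
                src = 'https:' + src
            print(src, title)
            download = requests.get(src)
            # hash the bytes for a safe file name (titles can contain '/')
            file_path = '{}.png'.format(md5(download.content).hexdigest())
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(download.content)
                print('Downloaded image path is %s' % file_path)
            else:
                print('Already Downloaded', file_path)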
Found where the problem was. I've also reworked the code from the first reply: it now runs on a multiprocessing pool and saves the images into an img folder, named after the page title.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
import os
from multiprocessing import Pool

def get_page(offset):
    fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(offset) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    data = requests.get(fl_url).text
    tj = "/detail/(.*?).html"
    p = re.compile(tj).findall(data)
    p = list(set(p))
    return p

def save_page(p):
    try:
        for i in p:
            new_url = "https://818ps.com/detail/" + str(i) + ".html"
            new_data = requests.get(new_url).text
            new_tj = r'//img.tuguaishou.com/ips_templ_preview/(.*?)"\salt.*title="(.*?)"/>'
            q = re.compile(new_tj).findall(new_data)
            for j, title in q:
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(j)
                title = re.sub('/', '_', str(title))  # titles may contain '/', which breaks file paths
                print("Downloading: " + title)
                img_path = 'img'
                if not os.path.exists(img_path):
                    os.makedirs(img_path)
                imagetemp = requests.get(tup_url).content
                file_path = img_path + os.path.sep + title + ".jpg"
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(imagetemp)
                    print("Download finished")
                else:
                    print('Already downloaded', file_path)
    except requests.exceptions.InvalidURL as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

def main(offset):
    p = get_page(offset)
    save_page(p)

group_start = 1   # first page
group_end = 10    # last page

if __name__ == '__main__':
    pool = Pool()
    groups = range(group_start, group_end + 1)  # was an empty tuple, so the pool had nothing to map
    pool.map(main, groups)
    pool.close()
    pool.join()
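A side note on the pool: multiprocessing.Pool forks worker processes, but these downloads are I/O-bound, so threads are usually the lighter fit. multiprocessing.dummy exposes a thread-backed Pool with the identical API, so the last block could be swapped as in this sketch (the worker count of 8 is an arbitrary choice of mine):

from multiprocessing.dummy import Pool  # same Pool API, backed by threads

if __name__ == '__main__':
    pool = Pool(8)  # 8 worker threads; tune as needed
    pool.map(main, range(group_start, group_end + 1))
    pool.close()
    pool.join()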
The site has restrictions. The link you're downloading the image from is actually a normal page URL, not the URL of the image itself.
http://img.tuguaishou.com/ips_templ_preview/10/cd/36/lg_2006799_1562406570_5d206eaa8fc3c.jpg!w1024_w?auth_key=2195922752-0-0-c503d6c4bc4d4d300e9961fdb9981463
Just like this one: it's really a page URL, not an image URL.
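A quick way to confirm what a link like that actually returns is the Content-Type response header; an HTML page and raw image bytes are easy to tell apart. A minimal sketch, reusing the URL above:

import requests

url = ("http://img.tuguaishou.com/ips_templ_preview/10/cd/36/"
       "lg_2006799_1562406570_5d206eaa8fc3c.jpg!w1024_w"
       "?auth_key=2195922752-0-0-c503d6c4bc4d4d300e9961fdb9981463")
resp = requests.get(url)
# 'image/jpeg' means the bytes are a real image; 'text/html' means the
# site answered with a page instead (e.g. because the auth_key was rejected)
print(resp.status_code, resp.headers.get('Content-Type'))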
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#! author = ixsec
#! date : 2019/7/19
#! filename : 818ps.py
# MIT License
#
# Copyright (c) 2019 ixsec
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import requests
from requests import RequestException
import os
from hashlib import md5
for a in range(0, 10):
    fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(a) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    data = requests.get(fl_url).text
    tj = "/detail/(.*?).html"
    p = re.compile(tj).findall(data)
    # p = list(set(p))
    try:
        for i in p:
            new_url = "https://818ps.com/detail/" + str(i) + ".html"
            new_data = requests.get(new_url).text
            new_tj = '//img.tuguaishou.com/ips_templ_preview/(.*?)"'
            q = re.compile(new_tj).findall(new_data)
            for j in q:
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(j)
                print("Downloading item: " + str(a) + str(i) + str(j))
                imagetemp = requests.get(tup_url).content
                with open("./" + str(a) + str(i) + str(j) + ".jpg", 'wb') as f:
                    f.write(imagetemp)
                print("Download finished")
    except requests.exceptions.InvalidURL as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
Bro, be more specific; I can't tell what you're trying to say.
This kid is beyond saving.
Someone has already answered, so I won't embarrass myself.
I'm just here to lurk.
The first reply is the correct answer!
This forum really is full of experts, and helpful ones at that. Respect.
ixsec posted on 2019-7-19 02:48:
The site has restrictions. The link you're downloading the image from is actually a normal page URL, not the image URL.
!w1024_w?auth_key=2195922752-0 ...
Bro, that part was written by another forum member; I noted it in a comment. What I want to ask is where the problem is in the code I wrote below it.