Help me out, guys. I spent the whole afternoon on this and couldn't get it working. Could someone help me fix it?
Today I saw a scraper another forum member wrote with the urllib library, and I tried to rewrite it with requests. After a whole day of work it still keeps throwing errors. Could someone take a look and tell me where the problem is?
import urllib.request
import re
import requests
from requests import RequestException
import os
from hashlib import md5
'''
for a in range(0, 10):
    fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(a) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    data = urllib.request.urlopen(fl_url).read().decode("utf-8", "ignore")
    tj = "/detail/(.*?).html"
    p = re.compile(tj).findall(data)
    p = list(set(p))
    try:
        for i in range(len(p)):
            free = str(p[i])
            new_url = "https://818ps.com/detail/" + free + ".html"
            new_data = urllib.request.urlopen(new_url).read().decode("utf-8", "ignore")
            new_tj = 'https://img.tuguaishou.com/ips_templ_preview/(.*?)"'
            q = re.compile(new_tj).findall(new_data)
            for j in range(len(q)):
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(q[j])
                file = "F:/bing/a/" + str(a) + str(i) + str(j) + ".jpg"
                print("Downloading item: " + str(a) + str(i) + str(j))
                urllib.request.urlretrieve(tup_url, filename=file)
                print("Download finished")
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
'''
def get_page(offset):
    base_url = 'https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/' + str(offset)
    url = base_url + '.html?route_id=15634484581189&route=3,&after_route=3'
    response = requests.get(url)
    try:
        if response.status_code == 200:
            return response.text
    except RequestException:
        return None
def parse_url(html):
    pattern = re.compile('<a.*?open-detail.*?href="(.*?)"', re.S)
    items = re.findall(pattern, html)
    urls = []
    for url in items:
        urls.append(url)
    return urls
def save_image(urls):
    for i in range(len(urls)):
        resp = requests.get('https://818ps.com' + urls)
        text = resp.text
        try:
            if resp.status_code == 200:
                pattern = re.compile('src="(.*?)"\salt.*title="(.*?)">', re.S)
                results = re.findall(pattern, text)
                for result in results:
                    print(result.group, result.group)
                    download = requests.get(result.group)
                    file_path = '{file_name}.{file_suffix}'.format(
                        file_name=result.group,
                        file_suffix='png')
                    if not os.path.exists(file_path):
                        with open(file_path, 'wb') as f:
                            f.write(download.content)
                        print('Downloaded image path is %s' % file_path)
                    else:
                        print('Already Downloaded', file_path)
        except Exception as e:
            print(e)
offset = 1

def main():
    html = get_page(1)
    urls = parse_url(html)
    print(urls)
    save_image(urls)

if __name__ == '__main__':
    main()
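For later readers, the errors in the snippet above can be pinned down. In save_image, 'https://818ps.com' + urls adds a string to the whole list (a TypeError); it should iterate the items. Then re.findall with a two-group pattern returns plain (src, title) tuples, not match objects, so result.group raises AttributeError. Also, in get_page the requests.get call sits outside the try block, so a RequestException can never be caught there. Here is a minimal corrected sketch of save_image, keeping the original regex; the protocol-relative 'https:' fix-up and the md5-based file name are my own assumptions (the unused md5 import hints that a hash name was intended):

import os
import re
import requests
from hashlib import md5

def save_image(urls):
    for url in urls:  # iterate the items instead of concatenating the whole list
        resp = requests.get('https://818ps.com' + url)
        if resp.status_code != 200:
            continue
        pattern = re.compile(r'src="(.*?)"\salt.*title="(.*?)">', re.S)
        # findall with two capture groups yields (src, title) tuples,
        # not match objects, so there is no .group to call on them
        for src, title in re.findall(pattern, resp.text):
            if src.startswith('//'):  # the site serves protocol-relative URLs
                src = 'https:' + src
            print(src, title)
            download = requests.get(src)
            # hash the bytes for a safe file name (titles can contain '/')
            file_path = '{}.png'.format(md5(download.content).hexdigest())
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(download.content)
                print('Downloaded image path is %s' % file_path)
            else:
                print('Already Downloaded', file_path)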
Found where the problem was. I've also reworked the code from the first reply: it now runs on a multiprocessing pool and saves the images into an img folder, named after the page title.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
import os
from multiprocessing import Pool

def get_page(offset):
    fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(offset) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    data = requests.get(fl_url).text
    tj = "/detail/(.*?).html"
    p = re.compile(tj).findall(data)
    p = list(set(p))
    return p

def save_page(p):
    try:
        for i in p:
            new_url = "https://818ps.com/detail/" + str(i) + ".html"
            new_data = requests.get(new_url).text
            new_tj = r'//img.tuguaishou.com/ips_templ_preview/(.*?)"\salt.*title="(.*?)"/>'
            q = re.compile(new_tj).findall(new_data)
            for j, title in q:
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(j)
                title = re.sub('/', '_', str(title))  # titles may contain '/', which breaks file paths
                print("Downloading: " + title)
                img_path = 'img'
                if not os.path.exists(img_path):
                    os.makedirs(img_path)
                imagetemp = requests.get(tup_url).content
                file_path = img_path + os.path.sep + title + ".jpg"
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(imagetemp)
                    print("Download finished")
                else:
                    print('Already downloaded', file_path)
    except requests.exceptions.InvalidURL as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

def main(offset):
    p = get_page(offset)
    save_page(p)

group_start = 1   # first page
group_end = 10    # last page

if __name__ == '__main__':
    pool = Pool()
    groups = range(group_start, group_end + 1)  # was an empty tuple, so the pool had nothing to map
    pool.map(main, groups)
    pool.close()
    pool.join()
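A side note on the pool: multiprocessing.Pool forks worker processes, but these downloads are I/O-bound, so threads are usually the lighter fit. multiprocessing.dummy exposes a thread-backed Pool with the identical API, so the last block could be swapped as in this sketch (the worker count of 8 is an arbitrary choice of mine):

from multiprocessing.dummy import Pool  # same Pool API, backed by threads

if __name__ == '__main__':
    pool = Pool(8)  # 8 worker threads; tune as needed
    pool.map(main, range(group_start, group_end + 1))
    pool.close()
    pool.join()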
The site has restrictions. The link you're downloading the image from is actually a normal page URL, not the URL of the image itself.
http://img.tuguaishou.com/ips_templ_preview/10/cd/36/lg_2006799_1562406570_5d206eaa8fc3c.jpg!w1024_w?auth_key=2195922752-0-0-c503d6c4bc4d4d300e9961fdb9981463
Just like this one: it's really a page URL, not an image URL.
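A quick way to confirm what a link like that actually returns is the Content-Type response header; an HTML page and raw image bytes are easy to tell apart. A minimal sketch, reusing the URL above:

import requests

url = ("http://img.tuguaishou.com/ips_templ_preview/10/cd/36/"
       "lg_2006799_1562406570_5d206eaa8fc3c.jpg!w1024_w"
       "?auth_key=2195922752-0-0-c503d6c4bc4d4d300e9961fdb9981463")
resp = requests.get(url)
# 'image/jpeg' means the bytes are a real image; 'text/html' means the
# site answered with a page instead (e.g. because the auth_key was rejected)
print(resp.status_code, resp.headers.get('Content-Type'))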
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#! author = ixsec
#! date : 2019/7/19
#! filename : 818ps.py
# MIT License
#
# Copyright (c) 2019 ixsec
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
import requests
from requests import RequestException
import os
from hashlib import md5
for a in range(0, 10):
    fl_url = "https://818ps.com/search/0-0-0-0-0-null-31_1_0-0-0/" + str(a) + ".html?route_id=15634482253883&route=1,3,&after_route=1,3"
    data = requests.get(fl_url).text
    tj = "/detail/(.*?).html"
    p = re.compile(tj).findall(data)
    # p = list(set(p))
    try:
        for i in p:
            new_url = "https://818ps.com/detail/" + str(i) + ".html"
            new_data = requests.get(new_url).text
            new_tj = '//img.tuguaishou.com/ips_templ_preview/(.*?)"'
            q = re.compile(new_tj).findall(new_data)
            for j in q:
                tup_url = "https://img.tuguaishou.com/ips_templ_preview/" + str(j)
                print("Downloading item: " + str(a) + str(i) + str(j))
                imagetemp = requests.get(tup_url).content
                with open("./" + str(a) + str(i) + str(j) + ".jpg", 'wb') as f:
                    f.write(imagetemp)
                print("Download finished")
    except requests.exceptions.InvalidURL as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
Bro, be more specific; I can't tell what you're trying to say.
This kid is beyond saving.
Someone has already answered, so I won't embarrass myself.
I'm just here to lurk.
The first reply is the correct answer!
This forum really is full of experts, and helpful ones at that. Respect.
ixsec posted on 2019-7-19 02:48:
The site has restrictions. The link you're downloading the image from is actually a normal page URL, not the image URL.
!w1024_w?auth_key=2195922752-0 ...
Bro, that part was written by another forum member; I noted it in a comment. What I want to ask is where the problem is in the code I wrote below it.