今天试着爬www.win4000.com的图，明明返回数据了，却下不下来，是不是有什么反爬了

cjs25 · 发表于 2020-3-31 11:46

本帖最后由 cjs25 于 2020-3-31 11:51 编辑

[Python] 纯文本查看 复制代码

#http://www.win4000.com/mt/star_0_2_2.html
#html/body/div[4]/div/div[4]/div[2]/div/div/ul/li[1]/a
#/html/body/div[4]/div/div[2]/div/div[1]/div[1]/em
#/html/body/div[4]/div/div[3]/div[1]/div[1]/div[2]/div/div/ul/li[1]/a
import requests, os, time, random
from lxml import etree


def ranheader():
    user1 = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    user2 = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
    user3 = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'
    #user5 = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    user4 = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
   # user6 = 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
   # user7 = 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
    #user8 = 'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
    list1 = [user1, user2, user3, user4]
    agent = random.choice(list1)
    header = {'User-Agent': agent}
    return header
def mulu(filename):
    file_path =filename
    if not os.path.exists(file_path):
        os.makedirs(file_path)
        print(file_path + '创建成功')
    else:
        print(file_path + "已存在")

    os.chdir(file_path)
def get_urllist(url):  # 访问网址
    req = requests.get(url, headers=ranheader()).content
    # html=etree.HTML(req.content.decode('utf-8'))
    return req
def down_file(addr,filename):
    print("正在打开"+filename)
    try:
        imgs_file=requests.get(addr,headers=ranheader())
        time.sleep(2)
        print(imgs_file)
       # if imgs_file == "<Response [200]>" :
        with open(filename, 'wb+') as f:
            f.write(imgs_file.content)
    except:
        print("文件保存错误")
def xeixi(persone):                                   #获得每页明星的网址，返回明星的名字跟志址用来创建明星目录
    html1 = etree.HTML(persone.decode())
    addlist1=html1.xpath('/html/body/div[4]/div/div[4]/div[2]/div/div/ul/li[1]/a/@href')       #取得每个明星的地址
    filename1=html1.xpath('/html/body/div[4]/div/div[4]/div[2]/div/div/ul/li[1]/a/img/@alt')           #取得每个明星的名字
 #   return (addlist1,filename1)
#def mingxinmulu(addlist,filename):                   #a获得每个明星有多少个子目录，跟地址．
    for i in range(0,len(filename1)):
        os.chdir("f:/美图")
        mulu(filename1[i])               #创建每个明星目录
        html2=etree.HTML(get_urllist(addlist1[i]).decode())
        addlist2=html2.xpath('/html/body/div[4]/div/div[3]/div[1]/div[1]/div[2]/div/div/ul/li[1]/a/@href')    #取得各个明星子目录的地址
        filename2=html2.xpath('/html/body/div[4]/div/div[3]/div[1]/div[1]/div[2]/div/div/ul/li[1]/a/img/@alt')#取得各个明星子目录的名称
        for k in range(0,len(addlist2)):
             mulu(filename2[k])
             html3=etree.HTML(get_urllist(addlist2[k]).decode())
             id3 = html3.xpath('/html/body/div[4]/div/div[2]/div/div[1]/div[1]/em/text()')
             img1=html3.xpath('/html/body/div[4]/div/div[2]/div/div[1]/div[2]/div[2]/a/@href')
             img1_file=img1[0].split("?")[0]
             filename3=str("1")+".jpg"
             print(id3[0])
             print(img1_file)
             down_file(img1_file,filename3)
          

             for n in range(2,int(id3[0])+1):

                 filename4 = str(n) + ".jpg"
                 cc=addlist2[k].split(".")[0]
                 img_url=cc+"_"+str(n)+"html"
                 html4=etree(get_urllist(img_url).decode())

                 img_addr=html4.xpath('/html/body/div[4]/div/div[2]/div/div[1]/div[2]/div[2]/a/@href')[0]
                 img_addr1=img_addr.split("?")[0]

                 down_file(img_addr,filename4)
def get_url(i, k):
    list1 = []
    for x in range(i, k + 1):
        #// *[ [url=home.php?mod=space&uid=402414]@[/url] id = "pic-meinv"] / a / img
        list1.append('http://www.win4000.com/mt/star_0_2_' + str(i) + ".html")
    return list1
if __name__ == '__main__':
    url='http: // www.win4000.com / mt / star_0_2_'
    st_page = input("请输入要开始的页数：")
    en_page = input("请输入要结束的页数：")
    file_path1 = "f:/美图"
    if not os.path.exists(file_path1):
        os.makedirs(file_path1)
        print(file_path1 + '创建成功')
    else:
        print(file_path1 + "已存在")

    os.chdir(file_path1)
    url_list = get_url(int(st_page), int(en_page))
    for tt in range(0, len(url_list)):
        xeixi(get_urllist(url_list[tt]))

lntuer · 发表于 2020-3-31 12:00

你请求头啥都没加，能对吗？

null119 · 发表于 2020-3-31 12:21

代码看着真累，header加Accept、Host后试试

天黑我隐身 · 发表于 2020-3-31 12:38

怎么找到的?对抛出的异常多看几眼
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www_2.html', port=80)
一看就知道请求的url不对劲
学会使用IDE的断点调试，不行那么print慢慢定位错误代码也可以
这些简单问题其实都可以自己解决，真的没必要求助

jidesheng6 · 发表于 2020-3-31 12:48

你写的这个代码真心累，写单个网址测试看看先

ycy0536 · 发表于 2020-3-31 12:57

表示看不懂

帐号		自动登录	找回密码
密码			注册[Register]

[求助] 今天试着爬www.win4000.com的图，明明返回数据了，却下不下来，是不是有什么反爬了

浏览过的版块