huayue 发表于 2020-3-21 21:12

求个大佬帮忙解决一下爬虫问题,翻遍了全网也不知道哪里错了

import openpyxl
import requests
from bs4 import BeautifulSoup
import openpyxl
import json
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title ='new title'
content=[]
url = ' http://www.syfc.js.cn/datacall/licence.aspx'
headers = {
'User-Agent':'***'
}
data={
'__EVENTTARGET': 'GV_List',
'__EVENTARGUMENT': 'Page$1',
'__VIEWSTATE':'GeCL0Anw+Qsit1tt1ti8qEfPxQwng5Mm4UITzf5QTJFrX11Fo35kt2GR1yaPOsGAKGSgZmFRj+oVUNkB+KNEfnGjli4I16RGK1Af7hgGAiNYin02ZZHaIdB3M+K+XPc6ipP+KqwnDqhHEtdFhILD4bKdu/jM7Me0KMWwo+98hXQ3qxc0TGG1W2AE4JlBo/OGX3OeJhGxx7iJ0gs9LifI0QfR8rU1WdA5RPE8tE4QgGiohx6tXCTepAvnVvrHV1Mi//KZVVTRtq1NSHggO6eDcV3ELzhcRsUgdH1iYlrCQguKH8BQKzdNe4Bq2R8A0xX0fEnm897ELXrDXrEBvcBKonyrwpmWMFCM7XWGM2u2FC3thTAtoJ9xZ6Tc2dcf712zFfB523Jza4BwxA+NJisRO9YQYtw5y0zBhOm+FO6JNwN4tv5QEwlgxxfQL/tZGdjwI2YjQmnOxwE8d5nWpkQJjYgoDXPUBuw06Mobk2X9p+dYb1+9bV39lG+a1klYcje8pIjz3mHMrWPAX8FHuY15hbL2UlGzXSeVmQ2fbZIrzz1mPi456TL1wBtTcsLEPgoz8cxeJUgd0OaPVzpvbfZ0PfgVr4PF5RVG1v5ToKLn9xgNx6aW8UWVDKGcIIzp7HcXKmwm58KVq+TYmMU5DaVCwclL6HKKVK6MhqbvdbGY9bupEAtelF1qQhVvaAJYa6K82bJ+ZkiG8vz9cmeDKjN8FCBdJOH/ZO33Qq9DXoGqCHTAbzIlgjJa8QaMOrYw8s/nXwYMLz1zHXlapx/EqmwD+6uzptNQszaFExAzLZaIQxmzggW5DcKxKtmabRwUZ/9VmPiJq25EO1UiQdrHi0Um9MU0+TwVD3rFOouubwyroAP2efRVZIbaCdhdEIGCPeLDVbBGMTpP9dlgfiSBDfkTZXp2yzZGzsin5rFrAJsXvSvb7jp3qLKBwqeObXYAEU+wvRfnU7kt/2xO0XFkJ4pTWpcafg3YZcCnTk4nLhRc2AzFDViLetq4qJSGpaTG4K/Lv5nhxVUa3IUqMbmlszApGsJdOXkOqiDS/bzTbQFltEyMbe+4Sp72IFP8QPaIhX+OTeVLIBNW3dRObKaVvqB6TraNIQBjwWzxSA/CDIbEvvDHzch3qv+yUmHF9gN1bqZguQX5RoD1uRdLwTjUUDPD8GmHLyrxjhxiNamp2eMlp7y4xPv9LEMuqsezBEpjEFiDnzsS9bW42puD3nz/od21A8QuzF4/y3s0TxGB33mzTBQhjCQ31oyxQegW41gX6qeA3hNlmqqHIcaRUiRJ5wypfCe9JKS5DbMHopd2b5C2GlmXGtNIBMuPo0diLRmgHA3EHksD6SIYUY/Uh91wS2kLqV8RvioNOF+sLLrtKPdtfAajJgN85I5OgVBiUBk1Bvp+YBJs3XkZMpte2zYFJwSEXPROs63SK4flXiC/9CVfXg2SnuhKVvPjDXgmhKMotu46iWuprvWzcbEZK0QUZiUVtyN/E0FIA2iSjpJj2SUw3KJVgvRTGN/Hz7A9g2l8B0aykoD4wh2Wotsnv3kB/qH0u7UB6s4HucUzOiZyMsauQLgfLsjW3ctduJ1saFqJ6OwcxvF3RtMPRr/KU5fW6HgSGgYTWxDwgMCfJ36aWIhxZnCwp5QPDD3IiKsjykPeZl0AG23vvVuvibRHgkHxbBx0BPEPoCnbYzIKmfgTavb6Cpups/mfon6tnCS9sxX/4s4HActKLEpxi+4vyx5gIy0uQ7tlH0JAPjSR2CVC0jxOxPLS/IQ4N6en/+NDf8MRkXQ8UihX5jDIsSvB47KOnx9QrWRGWn6HUAwlYT3AlVRy3ucQdy6Nc84mXo5G+P5+y/NE9oGuvoUqh3neWE1pGjiKpX/lzL+UxeMKX2PpsSqYaZQsDIqohzZTHzPiJK8Zw0UEl05G18jgooLO0Q0P7twscWzTmTzyZuasLxF6Es+4d2w183VG8AWRNjnua+PpKr2wp47rnh9+T96HWEBTvhFZ0CbabXqN4MDodzhZ6rtRwk65kIaPMmiki4mM+zVlJAcq3Uiq+YOfuaqtSPD77Zg4HSQKj4XfHhFbrPQ3i/CqblDzZrNJLPtrcoHawEDHVyyUjLZuXg9muWgFCNIq8yulgJzqb7IUl1mFrL7flvQo9HjqmeccIE2xjn3bE4jwfrHeD6s+s/X3XTN53schy3JoOFprqtQu3Rf2k7PLn3zoCg8y3prsHhDq0YWWdlulOj+GLXfr+3MqBtq6QNYL7B3ZqjEHK3yBskB+gJBNqD/D1DzGt6WsVm4aVBztzHbUPBk+rncm+CSNA5PhMp/WO06KDgTOlbSqnSA5Ik+RRnfxYX+K48EFPQXuh5o+12Fi+roMdyhxjEokCFu9FirUTJxGp2AMZEvpxadnH5QUruPpzsI0yIe6eKXlhl3k2eBUUIfiu6qsMCNsNoVVS43o/2tj726y3cCR1b7N0K9vcfp0cykg/afQzOtjkpM0tZFAvczjNYRjSsn3pjfPqdOJ5IWMuMaf/l1ex+OTLpl7KV/UGfaStOR573GnuSozuhANqKn3jwiTVDblt0qVw/lZ+XnBBzPvulxEukmUj0n5Sk2KQMWczx4r3PFWh7hGxEORuVXS4nmXuBAGLaejGnnvGP/ooqQ0eL+7NmaiyntiXgYh2OOkMGofi06uAOx/YALvVi/9YplyvtSXf4o6xcMBsZehpCRwAAuRAVQzgxwY1Pgzw9ERah4byHORCP3pMeZL5Lg415Q6rIJcw3510j24CE4bOOUkXFZPu8I/uWXj2ALgZ5jWSDsR7cLfCffFJR8x+q2kyFTBL9L+mDkhVlJGOsXxEAoJAcklSOGO1setlCs0KADxZCHCxa6c1OUMPSFKR393g3APJVy1l6KmaMOvNnSjBmqqF/4bVBRm2SbZwsD64nel0supLsdQY3vGWSBkFZshtUzprfdQa8Eusl7HGikX8xgs9LiCFkPzjolQuaxNWgKko393oin0GP71abQRoX6aXsVlNrOcckFakKUMTe9CAiGaJ0ExVwfQvvdLZjw5zBETUH2Vk6LYoMX0ni0SA3xOImR2pXlD6uLGqqPeILA7iRCEDdODRdeEjDKPFftpTO39NSZQQEuZWYHeZfFJPHL3eGWXpqcii9ahD3nWuku3jgE7JRE8p5G4Ivent7GLqdZxXEviaUKXrPzXYlmpiqbv/62mcWr24LfnHCBEEkXAjjGD6R+wk67GkWl6xOsumHFmcomofFAU9jGyZ+dniIOQ+f7CJ5xOGefUxUnuZf0WZqjRMH1hieaYLSN21YafW2oaqZL/ML2xeSedC2nQm2ylAswvIvsEoV0gu0rvaVT1uGeZkLlrOMR6sOyyQMI0xO03th8s5o7PpFEBAWltfaf8RnVpzBeQKYYg1eUWpYbbnh417TdTYCn/mRqz8u0TtNHui4U5toSNIjyj3gMzlYCcuP9Z1tJRyF9DEvimcmBKXnxd8JAfVDc+QMxph22spRg4ssl7PauJRBaoplIjhvrIoABIOuLWGeSn+lmigJatON9CyuC4q9MD3APVRKVxaRUBm2Whmko4nSYSu2PBx6F5mOGvvGf3OrNYZfYPDxQqWm80S6seXn7vMMcNoXx/HEnhUW1HS1vfHBehM3aMQ4dyDT5EhQ4SKKyaVfFQ1kleVkjKr5exdppdUJA9N0yoJm9ywGG5dh7rgw4K3fNfux7A3TpcoXr3MSDKCBCRz2Qc15CIp2ARtb1CYrx9I3WQTHs+i1pj5mQ1ORbdD0kjvDxHwEbhSL5XPH+/VkJD337lWs+ANi+hgQY0MOi9Xzgklj9oFSguc5U5npQOmMVSF6/RVNTsxim3PZz+q12QYym7PiRwwcV43nf7ZRvy8bGqQeMG+tCYuEKCyECWV3ftv7MpAWbOVuzOCB+s4E6/2ZDqhrxFfV7HWlb9xbzDu6HcTu1csTPuak4UhJC4C5TyH0AUMRXu2sBtQfY8CLt+j/bCx6Zwz5t0y44B9nLIK+Yu8p1S1iBsp4hcZvcXr3LzZ6b/Do39CGKMX+j6NjIaubwXJ8djZIitnOp0ir+OEIfcBfYURE/AAp0V7a6pensJOQGZmCFxm+GUMNYBRwlbv8yYyw1oHLzaWVx6bSMO9NEuNcPhBU+C+/khLAfAuvT9Lms+6MLNVcll5MwKiQR/0lSkud9qkN9t0Nb04lLs8B4CgMMqpngSGffI+jrcAbADOVYrSp5fY5WzpqAle0abId+5aHU8meQT9aNRj8Qn4x7EbZVwH0HuOX72TgAuRLRj9h0TtwpO9SfpV6Xo+lZ6v2RmtS472thC5R+V7x+TESlJO2QTBTgHQY1Q1y2MVwu+FnvqHQWYPCcTHinDU1I3WZub6hRphdM8+9ZdQ90jZrbvZ8k91udJxbn4p5jz0NJNkGAuT/w5HckRPoV8WJ8+UdifeHEnzepMtF//5WVAZFkLec6BG6/sApXctzHch0wFVOqSUC5+750ThTrIzzUiDi3keF1nLVG3JZMQgr+A/PykLU446rv9/5HhowX2jsymg9gXuAWQQq2alwsyCccOGx8aUTt1Gfmvbi32Pe143/TwUknLUijqJ70J5TFD1DIy+h+d2e2n8Wx4MdBHYER05CABAxPVvdc8xXbo+XMEP85TET8QzA6bqAge8MmWMVRETVpXa0PLKl7a3RUSjodlkTK0yHOvFv3Rwn/sKFmXZ7Ac/IMiWbOwva8H4q13j8mnOJ12Eqg2l+ze/F9Uqj8KLCZRHAg5RJks9UE+aAEMZ68tejqImjleI0Ko/XEGIYM/DuHCnZTOP2YxYbOaYH8+F2TSATqsz8jvb67y5Ni0udfYDYW6ndiXLXyADXcXC/W40yua/xo/N/TrdSobVFN94c8lRywgtFA1Xus7IKbITmOGDXUDsELE69oRtlvlgVq+WFTxRkwhGiB+t7BTBVJeFBgfU35Mhwurrz25S1Rm36neAzp9iVpmqspwCXidcCiaLB8fBuFqxTIDycmGYm8L1pKAoM3XB5a3LinIUEpR+RQJrIyViz4mNCOUCRSk4NSeiglCYcscK/W3eXSN920aXlraPcyK0wKg1zjMHfDZ8C+OWJ82lOlJm5Bm/te6RLTnRRcb+xXHQTyhA0TSwXpq/7rcY9XknnHWQMw3SBT2Yj1nB8VAMCStttjwE9QoeR703r5q1j6KQaDYrw5v05d1DRl+xZSrQkHw3RbkCdQ7zDePtpzEIoH90LCsyrr+XwDccOutoWrAuHoOGgzneL35YBGgVbYHoMKfHM7GBkSD4oPi4bvQxKPnMQEk8E6o4pZH4WmsTg5RHmUirVdUzFO3XVrzXPRLBk5k/jrh0GTrVdjbWAgNMXTpK2UWy+nb37Ox/31k9gKSupRHWH2BuCN5vS4RvvStktgY72uexnFPFmk88ZT0Bj8d6CI0DGL7GpAXwa1naZz9HLX5TdjPxbErRyEEvKDPRJVTwxq1WSR673VrLkOxhgnWYB67GSf/KePfURXtchLjYFAsCAWNTriulLSt+sSrzz7YCEx+0pjPHUFxjaMT0ICQbquXWrEXubSjhfJBPF0HzGyuQLjN0C+X76QLkBLx1OmWNl3DFuoodm2Eq8kD2B7pmvDMYfOrS7HofweOm+a1iXoknA91piEMqXM871gc0xV9VI9cIapFH7gr5wbFFMwZUu6O/in5wK3HIfhT4Z9FymTiUiTrHPddV0NLU/frlFY/+TPXpaJye/WxjgsCDBqtq2VHXO0ZU7cMqlXgOYDAx7BLyDUpbBGLadlUtQn5RAQf3vX2Sa16syuymFIxZaxU7R2s/3M/FHHM10N9lS9m2HE/umQu+6LHb3Q85JG+LfRIKbKM1S6+EAypIgA0wKbRZ3/CARsxGg3ACNHqp1SQ7TIAVJtVc+SNxoesUZ6RO0QQAyxfadflAUfm89q8LPGt78qi0Dp0r3HRqkvdxDIdfCmBevXci2xxDdvtl5CBu/CLBg61h8za1fMCK+M2t8v7iq2ytV3b4SyxDQLrKsxlVUDZGdVM4SetDQJVj5AjkCNPf3TdfOYYs1GWmZEkhBxnf2ELfLHxOCnYlCxL/q3xrAQsLEB7AY2rxbAxqRJXdiBCYvVM+aYFtmMl1GOFI0MPyKffm/ou+xe4E74XO+qxBfLuvXnBtEj3mtGwSji4979J/fBTOQ22U1rb4WlG7U3s3pCpQAtC16G15vkk9IKWchMXy6+m59bT16aeKhSb0Aj9us/xjU5ymYTXN/TaQaLu1gMtF2PyWRefUAs6zE1h1QR9LAYBHCNMDxVzgtg5dMiFEKQfbK1M9IaFQsSkWOcYwMMW2oyum+iGUmW+tMJHYaGGITG5QZk613mr3KV9tlIuSAnjMvVmmFfskB+J/xfUfQ+g89rsrsWdyBPUm8y0/vzplfOcnZhHiuVc0D+ZA0KMifAM7KL9cgybAhiaX1nxpoZJ623tfB0UphOjjCNnpNjcJRC9m/p6cc+uMqAsg4f80flNJi1NDVaXO+aJJ2zam+iMra6wQ3Tx2AOuJvmKGv5TDTZ9mYS8z8nwmlXfKaa6LBUcue85Rblb8xgU4YVkxVTJxin9wcoXvUtOxPLf2WpnXTF9GdswO0YDjA==',
'__VIEWSTATEENCRYPTED':'',
'__EVENTVALIDATION':'Zwkhl3X9wmjd2hNAj9TBGeCIZd1XpEZAWvjTL6OJt6LQWe7RVnE8ZLoqtouHfJoMLhN+Rvep4oUIx8yo04tkWQcGjEhr/x8I9aL3NfHLFT0M9y+FGAaECQopNi5hq2h5pYfpFGa4vrU2YglCHPlUL/1IbTlsdXQOLnHjT9f95TEQVnDIDloVKLwuVEjltyCmAK2+vsQkstrFn3+MWaLpZZxZHqtbNbP1Jbj9kRmslAORZR2mA16uVSNZB+LRU1d9PcVc27mgSr/pWgZuhgEpF96ILUVsQ0DXDmpETMbLF6fAHNtfbTU/8/hYW5gMeP4IC19yI9WkkiV/GROFFUmk5icEK295mJpmbRmHQoyt+Hw='
}
for i in range(2,4):
    data['__EVENTARGUMENT'] = 'Page$'+str(i)
    data1= json.dumps(data)
    response = requests.post(url = url,data=data1 ,headers =headers )
    bs=BeautifulSoup(response.text,'html.parser')
    eventvalidation=bs.find(id="__EVENTVALIDATION")['value']
    viewstate=bs.find(id="__VIEWSTATE")['value']
    link=bs.find_all('td',class_="gv-item-left")
    for a in link:
      link_1=a.find('a',href=True)['href']
      print(link_1)
      sheet.append()
      content.append(link_1)
    #print(type(eventvalidation))
    #print(type(viewstate))
    data['__VIEWSTATE'] = viewstate
    data['__EVENTVALIDATION'] =eventvalidation
   
    print(i)
    #print(type(viewstate))
   
#print(viewstate)
#
wb.save('射阳备案网明细链接.xlsx')




我是想爬取备案网里面的链接明细,发现进后一页需要前一页的VIEWSTATE和EVENTVALIDATION,但是我更新进去还是一直爬取的是第一页,是哪里有问题呢

whh19921024 发表于 2020-3-22 10:17

我建议把爬到的html代码放到文件里面,本地访问下看下,这样才好找原因。

huayue 发表于 2020-3-22 12:44

whh19921024 发表于 2020-3-22 10:17
我建议把爬到的html代码放到文件里面,本地访问下看下,这样才好找原因。

我已经放了,就是会一直爬取第一页的内容

天黑我隐身 发表于 2020-3-22 14:54

```python
for i in range(2,4):
    data['__EVENTARGUMENT'] = 'Page$'+str(i)
    data1= json.dumps(data)
    response = requests.post(url = url,data=data1 ,headers =headers )
    pass
```
这里的data1,原网页传的就是form表单,不是json序列化数据,直接传入字典data就行
另外data同样有问题,除了第一页可以直接通过get或者post访问,其他页都会验证__VIEWSTATE和__EVENTVALIDATION字段,如果你直接从第二页开始,服务器会返回一个状态码500的页面,因为你这两个字段是错误的。必须先访问第一页,拿到这两个字段才行。你这段代码因为没有传入标准数据格式,所以服务器返回的都是第一页

##### 总结两点
* requests直接传入字典
* 从第一页开始获取数据或者首先获取第一页的两个字段

##### 额外加餐
基于生成器的实现
```python
def get_next_parse():
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    yield soup
    page = 2
    while True:
      data['__EVENTARGUMENT'] = 'Page$%i' % page
      data['__VIEWSTATE'] = soup.find(id='__VIEWSTATE')['value']
      data['__EVENTVALIDATION'] = soup.find(id='__EVENTVALIDATION')['value']
      r = requests.post(url, data, headers=headers)
      if r.status_code != 200:# 停止迭代条件
            break
      soup = BeautifulSoup(r.text, 'html.parser')
      yield soup
      page += 1


g = get_next_parse()
# 前四页
for i in range(4):
    soup = next(g)
    # Do something
   
# 所有数据
for soup in get_next_parse():
    pass
    # Do something
```

huayue 发表于 2020-3-25 15:03

天黑我隐身 发表于 2020-3-22 14:54
```python
for i in range(2,4):
    data['__EVENTARGUMENT'] = 'Page$'+str(i)


谢谢大神,我也发现了是我循环有问题,改了循环以后就正确了
页: [1]
查看完整版本: 求个大佬帮忙解决一下爬虫问题,翻遍了全网也不知道哪里错了