python selector.css方法爬不到数据求助
# Scrape policy titles, publish dates and links from the gov.cn
# "latest policies" listing (page 4).
import requests
import parsel
from parsel import Selector

headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36'
}
url = 'https://www.gov.cn/zhengce/zuixin/home_4.htm'
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
html = response.text
# print(html)

# The page contains a stray, premature </html> tag; lxml (parsel's backend)
# stops building the tree there, so every selector below would come back
# empty. Dropping the first occurrence lets the full document be parsed.
html = html.replace('</html>', '', 1)

selector = parsel.Selector(html)
print(selector)
policy = selector.css('.news_box .list a::text').getall()
date = selector.css('.news_box .list span.date::text').getall()
href = selector.css('.news_box .list a::attr(href)').getall()
print(policy, date, href)
现象:html 可以获取到数据,selector 打印出来也有数据,但是后面的三个列表都是空值,问题出在哪里呢?
另外在开发者模式下,Ctrl+F 查找时,.news_box .list a 能够找到20条数据
请问如何修改这段代码?(建议断点调试一下。)其实只是你直接用 requests 抓下来的 html 里面多了一个 </html> 标签,然后把后面的内容全部阻断了而已。本帖最后由 很快再相见123 于 2023-10-7 00:52 编辑
参考官网api
https://parsel.readthedocs.io/en/latest/
import requests
# import parsel
from parsel import Selector
def main_func():
    """Fetch the gov.cn latest-policies page and print titles, dates and links."""
    ua = (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36'
    )
    resp = requests.get(
        'https://www.gov.cn/zhengce/zuixin/home_4.htm',
        headers={'User-Agent': ua},
    )
    resp.encoding = 'utf-8'
    page = resp.text
    # print(page)
    # Strip the doctype before handing the markup to parsel; this works
    # around the page's markup quirk that otherwise yields empty selections.
    sel = Selector(str(page).replace('<!DOCTYPE html>', ''))
    print(sel)
    titles = sel.css('.news_box .list a::text').getall()
    dates = sel.css('.news_box .list span.date::text').getall()
    links = sel.css('.news_box .list a::attr(href)').getall()
    print(titles, dates, links)


if __name__ == '__main__':
    main_func()
GET 请求的结果是 str,删除开头的 <!DOCTYPE html> 即可。
(官网啥都有,多看看,没坏处)
parsel模块众所周知是一个python的第三方库,其作用和功能等价于css选择器,xpath和re的集合版。和其他解析模块相比,例如BeautifulSoup,xpath等,parsel效率更高,使用更简单。 很快再相见123 发表于 2023-10-7 00:51
参考官网api
https://parsel.readthedocs.io/en/latest/
谢谢 我用你的方法确实可以获取到数据,但是还是有一点不明白。另外一个网站也是以<!DOCTYPE html>开头,结果不受影响。
#广东省人民政府网站
import requests
import parsel
from parsel import Selector
# Scrape the Guangdong Provincial People's Government news list:
# article titles, timestamps and links.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36'
}
page_url = 'http://www.gd.gov.cn/gdywdt/gdyw/index.html'
resp = requests.get(page_url, headers=request_headers)
resp.encoding = 'utf-8'
doc = resp.text
# print(doc)
sel = parsel.Selector(doc)
print(sel)
titles = sel.css('.viewList ul li a::text').getall()
timestamps = sel.css('.viewList ul li .time::text').getall()
links = sel.css('.viewList ul li a::attr(href)').getall()
print(titles, timestamps, links)
longzhouming 发表于 2023-10-6 23:48
其实只是你直接用requests抓下来的html里面多了个标签 然后把后面的内容全阻断了而已
谢谢你的提示,方法有效,是不是有什么工具,可以检测网页代码?这么快就找到问题了。
import requests
import parsel
from parsel import Selector
import re
# Working version: remove the stray premature </html> tag before parsing,
# so lxml builds the full tree and the selectors find the list entries.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36'
}
target_url = 'https://www.gov.cn/zhengce/zuixin/home_4.htm'
resp = requests.get(target_url, headers=request_headers)
resp.encoding = 'utf-8'
raw_html = resp.text
# Delete only the first </html>; the pattern is a literal, so re.sub with
# count=1 behaves the same as this substitution.
cleaned_html = re.sub(r'</html>', '', raw_html, count=1)
sel = parsel.Selector(cleaned_html)
print(sel)
titles = sel.css('.news_box .list a::text').getall()
dates = sel.css('.news_box .list span.date::text').getall()
links = sel.css('.news_box .list a::attr(href)').getall()
print(titles, dates, links)
本帖最后由 sai609 于 2023-10-7 13:47 编辑
有用的爬不了,没用的爬再多也还是没用
页:
[1]