爬取腾讯新闻的接口脚本
# 使用Scrapy框架爬取腾讯新闻的基本套路

```python
# -*- coding: utf-8 -*-
import json
from scrapy import Spider
from scrapy.http import Request
from scrapy.http import Response
from scrapy.http import FormRequest
from scrapy.selector import Selector
from bs4 import BeautifulSoup
from ..items import NewsItem
# Base URL of the Tencent News feed endpoint requested by the spider.
TencentNewsUrl = 'https://pacaio.match.qq.com/irs/rcd'
# Top-news feed: https://pacaio.match.qq.com/pc/topNews?callback=__jp0
# Sample feed request (JSONP): https://pacaio.match.qq.com/irs/rcd?cid=108&ext=&token=349ee24cdf9327a050ddad8c166bd3e3&page=1&expIds=&callback=__jp1
# Sample "cmsn" entry URL as found in the feed: https://new.qq.com/cmsn/20180726/20180726A0QOLA00
# Matching "omn" article page it is rewritten to: https://new.qq.com/omn/20180726/20180726A0QOLA.html
class TencentSpider(Spider):
    """Scrapy spider that reads a Tencent News feed and scrapes linked articles.

    Flow: ``start_requests`` asks the feed endpoint for one page,
    ``parse_contents`` decodes the (JSON or JSONP) answer and schedules the
    article pages, and ``parse_news`` turns an article page into a
    ``NewsItem``.
    """

    name = 'tencent'

    def start_requests(self):
        """Yield the single initial request for page 0 of the feed."""
        # NOTE(review): FormRequest with ``formdata`` and no explicit method
        # is sent as POST, while the sample URLs documented in this file are
        # GET query strings -- confirm the endpoint accepts POST.
        yield FormRequest(
            url=TencentNewsUrl,
            formdata={
                "cid": "58",
                "token": "c232b098ee7611faeffc46409e836360",
                "ext": "milite",
                "page": "0",
                "expIds": "",
                "callback": "__jp0",
            },
            callback=self.parse_contents,
            meta={
                "page": "0",
                "field": "",
            },
        )

    def parse_contents(self, response: Response):
        """Decode the feed response and yield a request for an article page.

        The body is parsed as plain JSON first; if that fails it is treated
        as JSONP (``callback({...})``) and the text between the first ``(``
        and the last ``)`` is parsed instead.

        Raises:
            ValueError: if the body is neither valid JSON nor valid JSONP.
            KeyError: if a feed entry has no ``'vurl'`` key.
        """
        body = response.text
        try:
            # json.loads (not json.load): response.text is a str, not a file.
            data = json.loads(body)
        except ValueError:
            # JSONP wrapper: strip "callbackName(" ... ")" and parse the rest.
            data = json.loads(body[body.find('(') + 1:body.rfind(')')])

        # Some responses wrap the entry list in {"data": [...]}; others are
        # used as they are.
        if isinstance(data, dict) and 'data' in data:
            data = data['data']

        for entry in data:
            omn = entry['vurl']
            if omn.endswith('00') and '/cmsn/' in omn:
                # Rewrite ".../cmsn/<id>00" into ".../omn/<id>.html".
                omn = omn.replace('/cmsn/', '/omn/')
                omn = omn[:-2] + '.html'
            print(omn)
            yield Request(
                url=omn,
                callback=self.parse_news,
            )
            # NOTE(review): the original stops after the first entry --
            # presumably a limiter left in for demonstration; the source's
            # indentation was lost, so confirm the intended nesting.
            break

    def parse_news(self, response: Response):
        """Build a ``NewsItem`` (url, title, content) from an article page.

        The title is the ``<h1>`` inside ``<div class="LEFT">``; the content
        is the concatenated text of every ``<p class="one-p">``.

        Raises:
            AttributeError: if the page has no ``div.LEFT`` or it has no
                ``<h1>``.
        """
        news = NewsItem()
        news['url'] = response.url
        soup = BeautifulSoup(response.text, "lxml")
        news['title'] = soup.find('div', class_='LEFT').h1.text
        paragraphs = soup.find_all('p', class_='one-p')
        news['content'] = ''.join(paragraph.text for paragraph in paragraphs)
        return news
```

and1=1 发表于 2018-7-27 22:22
scrapy模块怎么安装的,是要去下个python2.7吗?
这是python3的程序,安装了py3之后可以使用pip安装 alongwy 发表于 2018-8-3 07:24
这是python3的程序,安装了py3之后可以使用pip安装
谢谢,我已经解决了 可以牛逼 scrapy模块怎么安装的,是要去下个python2.7吗? 支持下,给你加个油{:1_921:} 这个有什么用 and1=1 发表于 2018-7-27 22:22
scrapy模块怎么安装的,是要去下个python2.7吗?
装好依赖包,本地安装 感谢大神!
你还有很多代码没有放出来!{:301_995:}
页:
[1]
2