利用 Python 爬取蓝光网首页视频的磁力链接
- 利用 requests 库和 pyquery 库爬取和解析数据,然后存储到 MongoDB 中
- 获取首页源代码并且解析每一个电影的详细页面链接
# Browser-like User-Agent header passed to the requests.get() calls.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.132 Safari/537.36"
    ),
}
def get_index_page():
    """Yield the detail-page link of every movie listed on the home page.

    Downloads the home page, selects each ``.mi_btcon`` element directly
    under ``.mi_cont`` and walks the ``<li>`` entries inside it.

    Yields:
        The ``href`` of the anchor found in each list item. List items that
        carry no link are skipped.
    """
    # A timeout keeps the crawler from hanging forever on a stalled server.
    response = requests.get("http://www.languang.co/", headers=headers, timeout=10)
    doc = pq(response.text)
    for div in doc(".mi_cont>.mi_btcon").items():
        for li in div("li").items():
            href = li("a").attr("href")
            # attr() gives None when the item has no anchor/href; passing
            # that on would make the caller request an invalid URL.
            if href:
                yield href
- 获取详情页面的源代码并且进行解析,然后把解析到的电影名字和磁力链接保存到mongodb中
def get_detail_info(detail_link):
    """Scrape one movie detail page and store its name and links in MongoDB.

    Args:
        detail_link: URL of the movie detail page to download.

    Side effects:
        Inserts a ``{"name": ..., "links": [...]}`` document into ``col``
        and prints a confirmation line.
    """
    # A timeout keeps one unresponsive page from blocking the whole crawl.
    response = requests.get(detail_link, headers=headers, timeout=10)
    doc = pq(response.text)
    name = doc(".moviedteail_tt>h1").html()

    hrefs = (
        li("a").attr("href")
        for li in doc(".mikd>.mi_down_dy>.mi_ne_kd li").items()
    )
    # Drop list items without a link so no None entries are stored.
    bt_link = [href for href in hrefs if href]

    col.insert_one({
        "name": name,
        "links": bt_link,
    })
    print(name, "插入成功!")
全部源代码
import requests
from pyquery import PyQuery as pq
import pymongo
# MongoDB client for the server on localhost; ``col`` is the ``movies``
# collection of the ``languang`` database that get_detail_info() writes to.
mongocli = pymongo.MongoClient("localhost")
col = mongocli.get_database("languang").get_collection("movies")
# Browser-like User-Agent header passed to the requests.get() calls.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.132 Safari/537.36"
    ),
}
def get_index_page():
    """Yield the detail-page link of every movie listed on the home page.

    Downloads the home page, selects each ``.mi_btcon`` element directly
    under ``.mi_cont`` and walks the ``<li>`` entries inside it.

    Yields:
        The ``href`` of the anchor found in each list item. List items that
        carry no link are skipped.
    """
    # A timeout keeps the crawler from hanging forever on a stalled server.
    response = requests.get("http://www.languang.co/", headers=headers, timeout=10)
    doc = pq(response.text)
    for div in doc(".mi_cont>.mi_btcon").items():
        for li in div("li").items():
            href = li("a").attr("href")
            # attr() gives None when the item has no anchor/href; passing
            # that on would make the caller request an invalid URL.
            if href:
                yield href
def get_detail_info(detail_link):
    """Scrape one movie detail page and store its name and links in MongoDB.

    Args:
        detail_link: URL of the movie detail page to download.

    Side effects:
        Inserts a ``{"name": ..., "links": [...]}`` document into ``col``
        and prints a confirmation line.
    """
    # A timeout keeps one unresponsive page from blocking the whole crawl.
    response = requests.get(detail_link, headers=headers, timeout=10)
    doc = pq(response.text)
    name = doc(".moviedteail_tt>h1").html()

    hrefs = (
        li("a").attr("href")
        for li in doc(".mikd>.mi_down_dy>.mi_ne_kd li").items()
    )
    # Drop list items without a link so no None entries are stored.
    bt_link = [href for href in hrefs if href]

    col.insert_one({
        "name": name,
        "links": bt_link,
    })
    print(name, "插入成功!")
def main():
    """Crawl the home page and process every detail link it yields."""
    for link in get_index_page():
        get_detail_info(link)
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
希望大家多多指教!