heaven096 posted on 2019-7-9 11:32

Scraping newbie practicing crawling a site's data with Python: PyCharm shows no errors but also no output

PyCharm reports no errors and produces no output either. I've spent several days looking for the cause without finding it. Source code below:

# -*- coding: utf-8 -*-
from lxml import etree
import requests
import csv
import time

def writecsv(item, name):
    file_name = name
    with open(file_name, 'a', encoding='utf-8', errors='ignore', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(item)


if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    start_url = 'http://shenzhen.qfang.com/sale/f'
    for x in range(1, 4):
        url = start_url + str(x)
        html = requests.get(url, headers=headers)
        time.sleep(1)
        selector = etree.HTML(html.text)  # build the selector
        # grab the list of property listings
        house_list = selector.xpath('//*[@id="cycleListings"]/ul/li')
        for house in house_list:
            apartment = house.xpath('div/p/a/text()')     # listing summary; text() extracts the text content
            layout = house.xpath('div/p/span/text()')     # layout
            area = house.xpath('div/p/span/text()')       # area
            floor = house.xpath('div/p/span/text()')      # floor
            region = house.xpath('div/p/span/text()')     # address
            total_price = house.xpath('div/span/text()')  # total price
            price = house.xpath('div/p/text()')           # unit price
            # reconstructed: the forum swallowed the bracketed list on this line
            item = [[apartment, layout, area, floor, region, total_price, price]]
            writecsv(item, 'qfang.csv')
            print('Scraping', apartment)

果汁分妳一半 posted on 2019-7-9 11:48

Last edited by 果汁分妳一半 on 2019-7-9 18:01

Bumping this for you.

小葫蘆 posted on 2019-7-9 12:12

Check whether the page loads its content inside an IFRAME.
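
If the listings live in a frame, an XPath against the outer page matches nothing. A quick way to check (a minimal sketch; the URL is the OP's first page):

from lxml import etree
import requests

r = requests.get('http://shenzhen.qfang.com/sale/f1',
                 headers={'User-Agent': 'Mozilla/5.0'})
page = etree.HTML(r.text)
print(page.xpath('//iframe/@src'))  # a non-empty list means the content sits inside an iframe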

Hatsune_miku posted on 2019-7-9 12:17

You're not getting any data at all.
https://i.loli.net/2019/07/09/5d2415398d3a168827.png

https://i.loli.net/2019/07/09/5d24155e2cd9766524.png
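
You can confirm this yourself with a few prints (a minimal standalone sketch that mirrors the variable names in your script):

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('http://shenzhen.qfang.com/sale/f1', headers=headers)
selector = etree.HTML(html.text)
house_list = selector.xpath('//*[@id="cycleListings"]/ul/li')
print(html.status_code)  # 200 alone doesn't prove you got the real page
print(len(html.text))    # an anti-bot interstitial is usually much shorter than a listing page
print(house_list)        # [] means your inner loop never runs: no output, and no error either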

bhblinux posted on 2019-7-9 12:21

house_list is empty, so the loop body never runs and nothing gets printed.

bhblinux posted on 2019-7-9 12:32

This site has anti-scraping measures; you won't get any data this way.

Hatsune_miku posted on 2019-7-9 12:44

I've fixed it up for you. Your code itself is basically fine; you just didn't notice that this site has anti-scraper protection, so you have to send a cookie.
from lxml import etree
import requests
import csv

def writecsv(item, name):
    file_name = name
    with open(file_name, 'a', encoding='utf-8', errors='ignore', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(item)

def main():
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36', 'cookie': 'sec_tc=AQAAALzdrQT7PwsAy26zGmKPO2pZzGGb; acw_tc=7250b31d15626470107378682edaf31ff9f3c7faef9a74479816f2d253; sid=3a8bc0c3-6342-4c83-ac75-39c0c16c16c7; cookieId=7d66083d-2348-4c9d-a581-6f135371b219; qchatid=88e66c82-6651-4caf-a790-0bee5858764a; language=SIMPLIFIED; JSESSIONID=aaapkk1qfzzdw_eJQbvVw; acw_sc__v2=5d241a4783955b7e1c6eee4efd16b61b8998fe86; CITY_NAME=SHENZHEN; WINDOW_DEVICE_PIXEL_RATIO=1; Hm_lvt_de678bd934b065f76f05705d4e7b662c=1562647117; _jzqa=1.22871634995936444.1562647117.1562647117.1562647117.1; _jzqc=1; _jzqx=1.1562647117.1562647117.1.jzqsr=shenzhen%2Eqfang%2Ecom|jzqct=/sale/f1.-; _jzqckmp=1; _ga=GA1.3.1191167714.1562647117; _gid=GA1.3.1281729903.1562647117; _qzjc=1; _qzja=1.440288992.1562647116816.1562647116816.1562647116817.1562647124059.1562647127518.0.0.0.3.1; _qzjb=1.1562647116816.3.0.0.0; _qzjto=3.1.0; _jzqb=1.3.10.1562647117.1; Hm_lpvt_de678bd934b065f76f05705d4e7b662c=1562647128'}
    for x in range(1, 4):
        url = f'https://shenzhen.qfang.com/sale/f{x}'
        r = requests.get(url, headers=headers)
        page = etree.HTML(r.text)
        house_list = page.xpath('//*[@id="cycleListings"]/ul/li')
        for house in house_list:
            apartment = house.xpath('div/p/a/text()')     # listing summary; text() extracts the text content
            layout = house.xpath('div/p/span/text()')     # layout
            area = house.xpath('div/p/span/text()')       # area
            floor = house.xpath('div/p/span/text()')      # floor
            region = house.xpath('div/p/span/text()')     # address
            total_price = house.xpath('div/span/text()')  # total price
            price = house.xpath('div/p/text()')           # unit price
            # reconstructed: the forum swallowed the bracketed list on this line
            item = [[apartment, layout, area, floor, region, total_price, price]]
            writecsv(item, 'qfang.csv')
            print('Scraping', apartment)

if __name__ == '__main__':
    main()

Hatsune_miku posted on 2019-7-9 12:45

Hatsune_miku posted on 2019-7-9 12:44
I've fixed it up for you. Your code itself is basically fine; you just didn't notice that this site has anti-scraper protection, so you have to send a cookie.


https://i.loli.net/2019/07/09/5d241b7d48ed194025.png

wuyoukm posted on 2019-7-9 13:57

I've fixed it so it actually works. The main problem was that you weren't sending a cookie. I also found that the cookie expires, so eventually you'll have to reverse-engineer the JS file that generates it (see the session-based sketch after the code). On top of that, your syntax has real problems too; take a careful look yourself.

from lxml import etree
import requests
import csv
import time

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'cookie': 'cw_tc=7250b31e15626454505637126e49061dae3656054ec8ebad60179b4c90; qchatid=39c4e0a7-00b2-4cd0-bbbf-26d3e6433ebf; language=SIMPLIFIED; JSESSIONID=aaaEzLqo3xaHr9dav7uVw; WINDOW_DEVICE_PIXEL_RATIO=1; _ga=GA1.3.769635160.1562645455; _gid=GA1.3.599078110.1562645455; Hm_lvt_de678bd934b065f76f05705d4e7b662c=1562645455; _jzqa=1.3770715335462188000.1562645515.1562645515.1562645515.1; _jzqc=1; _jzqckmp=1; sid=96f8e12f-0438-4faa-9bba-1163d8024186; SALEROOMREADRECORDCOOKIE=100644811; looks=SALE%2C100644811%2C55538; historyKeywords_SHANGHAI_SALE=%E6%B1%A4%E8%87%A3%E4%B8%80%E5%93%81; historyKeywords_SHANGHAI_NEWHOUSE=%E5%9B%9B%E5%90%88%E9%99%A2|%E6%B1%A4%E8%87%A3%E4%B8%80%E5%93%81; historyKeywords_BEIJING_SALE=%E5%9B%9B%E5%90%88%E9%99%A2; historyKeywords=%E5%B0%96%E6%B2%99%E5%92%80; searchPersonIds=213995097a213772013a2214223a213995003; cookieId=d2b2d395-b904-4817-8825-4a6c737cded5; HOUSE_PRICE_TOKEN=374391848b084135a1e45e27e94d68e8; sec_tc=AQAAALBB3QN54AkAfXkhKGP4oY8lprCj; acw_sc__v2=5d2428f687e081ce36da9330ba33d97f756d7097; CITY_NAME=SHENZHEN; _qzjc=1; _dc_gtm_UA-47416713-1=1; _qzja=1.269318586.1562645515482.1562645515483.1562650874632.1562650874632.1562651053750.0.0.0.11.2; _qzjb=1.1562650874632.2.0.0.0; _qzjto=11.2.0; _jzqb=1.31.10.1562645515.1; Hm_lpvt_de678bd934b065f76f05705d4e7b662c=1562651054',
}
start_url = 'https://shenzhen.qfang.com/sale/f'


def main(writer):
    for x in range(1, 5):
        url = start_url + str(x)
        html = requests.get(url, headers=headers)
        time.sleep(1)
        selector = etree.HTML(html.text)  # build the selector
        # grab the list of property listings
        house_list = selector.xpath('//*[@id="cycleListings"]/ul/li')
        for house in house_list:
            # NOTE: the [n] indices below are reconstructions; the forum stripped the
            # original bracketed subscripts, so the exact positions are assumptions
            apartment = house.xpath('./div/p/a/text()')[0].strip()  # listing summary
            layout = house.xpath('./div/p/span/text()')[0].strip()  # layout
            area = house.xpath('./div/p/span/text()')[1].strip()    # area
            floor = house.xpath('./div/p/span/text()')[2].strip()   # floor
            one = house.xpath('./div/p/span/a/text()')[0].strip()
            try:
                two = house.xpath('./div/p/span/a/text()')[1].strip()
                three = house.xpath('./div/p/span/a/text()')[2].strip()
            except IndexError:
                region = one
            else:
                region = one + "-" + two + "-" + three
            total_price = house.xpath('./div/span/text()')[0].strip()  # total price
            price = house.xpath('./div/p/text()')  # unit price (left as a list, as in the original)
            item = (apartment, layout, area, floor, region, total_price + "万", price)  # 万 = 10,000 CNY
            writer.writerow(item)
            print('Scraping', region)


if __name__ == '__main__':
    with open('qfang.csv', 'w+', encoding='utf-8-sig', newline='') as csvf:
        writer = csv.writer(csvf)
        # header row: seven columns to match each written row
        writer.writerow(('summary', 'layout', 'area', 'floor', 'region', 'total price', 'unit price'))
        main(writer)
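
If you don't want to keep hand-copying cookies from the browser as they go stale, one thing to try is a requests.Session, which carries whatever cookies the server sends back across requests. This is only a sketch under the assumption that the anti-bot cookies arrive as normal Set-Cookie headers; if acw_sc__v2 is computed client-side in JS (which is likely), you'd still have to reverse that JS or drive a real browser:

import requests

session = requests.Session()
session.headers.update({'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
session.get('https://shenzhen.qfang.com/')             # warm-up request to collect server-set cookies
r = session.get('https://shenzhen.qfang.com/sale/f1')  # cookies from the warm-up ride along automatically
print(r.status_code, len(r.text))                      # sanity check before parsing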

黄健龙 posted on 2019-7-9 14:33

Could one of the experts here share a Taobao scraper?