我又针对国家应急部下手了,但是etree出现了点问题
为啥我用etree做的时候,找不到table目录下的tr和td啊,不合理呀。这个代码是能找到table目录的,遍历结果是3,但是....
import requests
import csv
import re
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15'
}
accidents = {}
for i in range(6):
if i < 1:
url = 'https://www.mem.gov.cn/gk/sgcc/tbzdsgdcbg/index.shtml'
else:
url = 'https://www.mem.gov.cn/gk/sgcc/tbzdsgdcbg/index_%s.shtml' % i
print(url)
req = requests.get(url, headers=headers)
# print(req.text)
table_1 = etree.HTML(req.text).xpath("//table[@width='790']")
print(len(table_1))
# accident = {}
# for j in range(len(table_1)):
# accident['title'] = table_1.xpath('td/a/text()')
# accident['time'] = table_1.xpath('td/a/span/text()')
# accident['href'] = table_1.xpath('td/a/@href')
# print(accident)
如果我加上tr 或 td
遍历结果是0
import requests
import csv
import re
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15'
}
accidents = {}
for i in range(6):
if i < 1:
url = 'https://www.mem.gov.cn/gk/sgcc/tbzdsgdcbg/index.shtml'
else:
url = 'https://www.mem.gov.cn/gk/sgcc/tbzdsgdcbg/index_%s.shtml' % i
print(url)
req = requests.get(url, headers=headers)
# print(req.text)
table_1 = etree.HTML(req.text).xpath("//table[@width='790']/tr/td")
print(len(table_1))
# accident = {}
# for j in range(len(table_1)):
# accident['title'] = table_1.xpath('td/a/text()')
# accident['time'] = table_1.xpath('td/a/span/text()')
# accident['href'] = table_1.xpath('td/a/@href')
# print(accident)
自己解决了,不好意思。
但是我还是不太明白为什么{:301_971:}
import requests
import csv
import re
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15'
}
accidents = {}
for i in range(6):
if i < 1:
url = 'https://www.mem.gov.cn/gk/sgcc/tbzdsgdcbg/index.shtml'
else:
url = 'https://www.mem.gov.cn/gk/sgcc/tbzdsgdcbg/index_%s.shtml' % i
print(url)
req = requests.get(url, headers=headers)
req.encoding = req.apparent_encoding
table = etree.HTML(req.text).xpath("//table[@width='790']")
accident = {}
for j in range(len(table)):
table_tr = table.xpath('tr')
for k in range(len(table_tr)):
accident['title'] = table_tr.xpath('td/a/text()')
accident['time'] = table_tr.xpath('td/a/span/text()')
accident['href'] = table_tr.xpath('td/a/@href')
print(accident)
hj170520 发表于 2020-12-19 23:30
自己解决了,不好意思。
但是我还是不太明白为什么
import request ...
import requests
import csv
import re
from lxml import etree
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15'}
accidents = {}
for i in range(6):
if i < 1:
url = 'https://www.mem.gov.cn/gk/sgcc/tbzdsgdcbg/index.shtml'
else:
url = 'https://www.mem.gov.cn/gk/sgcc/tbzdsgdcbg/index_%s.shtml' % i
print(url)
req = requests.get(url, headers=headers).text.encode('raw_unicode_escape').decode()
# print(req)
table = etree.HTML(req)
a=table.xpath('//*[@width="790"]/tr/td/a/text()')
print(a) 谢谢啊啊,看不懂哈哈哈尴尬 本帖最后由 姓木名木木 于 2020-12-20 01:12 编辑
第一次抓的等于3,你抓的xpath路径就是三块啊(你进官网一眼就可以看到分成三部分)
建议直接抓完整路径//*[@width="790"]/tr/td/a/text() 姓木名木木 发表于 2020-12-20 01:10
第一次抓的等于3,你抓的xpath路径就是三块啊(你进官网一眼就可以看到分成三部分)
建议直接抓完整路径// ...
我只是在html元素分析时可以遍历到所有的tr,在python里没实现出来。
后来自己换了个方式实现了。 青山绿水meng 发表于 2020-12-19 23:47
import requests
import csv
import re
感谢,还帮我转了个码{:301_986:}
页:
[1]