selenium 必应搜索获取标题及url
使用 selenium 进行必应搜索,获取搜索结果的标题以及 url。

## 代码
```python
import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import urllib.parse
def get_content(driver, url, wait_seconds=None):
    """Load one Bing result page and collect title/domain/link records.

    Args:
        driver: Selenium WebDriver (any object exposing ``get`` and
            ``find_elements``).
        url: Address of the result page to open.
        wait_seconds: Seconds to pause after navigation so the page can
            render. When ``None``, the module-level ``sleep_time`` is used
            if it has been defined, otherwise 1 second.

    Returns:
        A list of newline-terminated strings, one per result link, in the
        form ``<title>||<scheme>://<host>||<href>``.
    """
    if wait_seconds is None:
        # The script entry point defines ``sleep_time`` globally; fall back
        # to a sane default so the function also works when imported.
        wait_seconds = globals().get("sleep_time", 1)

    driver.get(url)
    time.sleep(wait_seconds)

    # ``find_elements_by_xpath`` was removed in Selenium 4; the generic
    # locator call works on both Selenium 3 and 4 ("xpath" == By.XPATH).
    anchors = driver.find_elements("xpath", "//li[@class='b_algo']/h2/a")

    infos = []
    # Keep it simple: store the title, the site root and the full link.
    for anchor in anchors:
        href = anchor.get_attribute("href")
        if not href:
            # An anchor without a target has no usable URL to record.
            continue
        parts = urllib.parse.urlparse(href)
        domain = f"{parts.scheme}://{parts.netloc}"
        infos.append(f"{anchor.text}||{domain}||{href}\n")
    return infos
def main(key, page=2):
    """Search Bing for ``key`` and save result titles and URLs to a file.

    Opens Chrome through Selenium, walks up to ``page`` result pages by
    following the "Next page" link, and writes every collected record to
    ``result<HH MM SS>.txt`` in the current directory.

    Args:
        key: Search term; it is URL-encoded before being placed in the
            query string.
        page: Maximum number of result pages to visit.
    """
    # The matching chromedriver binary has to be downloaded separately.
    # NOTE(review): recent Selenium 4 releases no longer accept the driver
    # path as a positional argument -- confirm the installed version.
    driver = webdriver.Chrome('./driver/chromedriver.exe')
    try:
        # Implicit wait; the original author found it insufficient on its
        # own, which is why get_content also sleeps after each page load.
        driver.implicitly_wait(10)
        # Encode the term so spaces, '&', '#', non-ASCII text, etc. cannot
        # corrupt the query string.
        query = urllib.parse.quote_plus(str(key))
        url = f"https://cn.bing.com/search?q={query}&qs=n&sp=-1&pq=ni&sc=8-2&sk=&cvid=51AA598AE26B4774B0C37C165EB69C9B&first=1&FORM=PQRE1&ensearch=1"
        result = []
        for i in range(page):
            result += get_content(driver, url)
            print(f"page {i} over")
            try:
                # Try to find the link to the next result page.
                # ``find_element_by_xpath`` was removed in Selenium 4; the
                # generic locator call works on Selenium 3 and 4 alike.
                el_next_page = driver.find_element("xpath", "//a[@title='Next page']")
            except NoSuchElementException:
                break
            url = el_next_page.get_attribute("href")
            if not url:
                # A next-page anchor without a target cannot be followed.
                break
    finally:
        # Always shut the browser down, even if a page fetch raised.
        driver.quit()
    with open(f"result{time.strftime('%H %M %S')}.txt", 'w', encoding='utf-8') as f:
        f.writelines(result)
if __name__ == "__main__":
    # Pause (in seconds) applied after each page load; get_content reads
    # this module-level name.
    sleep_time = 1
    # Demo run: search for "123" and collect at most two result pages.
    main(key="123", page=2)
```

捉个小虫,第8行的双引号中英文混用了,导致运行不了。

> xyl52p 发表于 2021-9-13 22:54
> 捉个小虫,第8行的双引号中英文混用了,导致运行不了。

写注释时没太留意{:1_907:}
页:
[1]