xiaojipkhuang1 发表于 2024-8-30 09:57

请教大神们一个Python selenium问题

import re
import threading
import time
import driver
import pymysql
import requests
from selenium import webdriver
import time
import random
import xlwt
from selenium.webdriver.common.by import By
import smtplib
from email.mime.text import MIMEText
from email.header import Header
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib3.util import wait
import datetime
from datetime import datetime, timedelta
from apscheduler.schedulers.blocking import BlockingScheduler
import pymssql
import csv
import os


def get_ts(keyword, text):
    """Return the number that follows ``keyword`` in ``text``.

    Any run of CJK characters (U+4E00-U+9FA5) sitting between the keyword
    and the number is skipped. The keyword is matched literally.

    Args:
        keyword: Literal marker that precedes the number (e.g. a currency
            sign or a label).
        text: Text to search.

    Returns:
        The first run of digits/dots found after the keyword, as a string,
        or ``"1"`` when the keyword is not followed by a number.
    """
    # Escape the keyword so regex metacharacters in it ("$", "(", "." ...)
    # are matched as plain text instead of being interpreted as syntax.
    pattern = r'{}[\u4e00-\u9fa5]*([\d\.]+)'.format(re.escape(keyword))
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return "1"


def get_gs(keyword, text):
    """Return the integer that immediately precedes ``keyword`` in ``text``.

    The keyword is matched literally.

    Args:
        keyword: Literal marker that directly follows the number.
        text: Text to search.

    Returns:
        The digits found right before the keyword, as a string, or ``"0"``
        when there is no such occurrence.
    """
    # Escape the keyword so regex metacharacters in it are treated as text.
    pattern = fr"(\d+){re.escape(keyword)}"
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return "0"


def get_yp(keyword, text):
    """Return the integer that immediately precedes ``keyword`` in ``text``.

    The keyword is matched literally.

    Args:
        keyword: Literal marker that directly follows the number.
        text: Text to search.

    Returns:
        The digits found right before the keyword, as a string, or ``"0"``
        when there is no such occurrence.
    """
    # Escape the keyword so regex metacharacters in it are treated as text.
    pattern = fr"(\d+){re.escape(keyword)}"
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return "0"


def get_tnr(keyword, text):
    """Return the number that follows ``keyword`` in ``text``.

    Any run of CJK characters (U+4E00-U+9FA5) sitting between the keyword
    and the number is skipped. The keyword is matched literally.

    Args:
        keyword: Literal marker that precedes the number.
        text: Text to search.

    Returns:
        The first run of digits/dots found after the keyword, as a string,
        or ``"0"`` when the keyword is not followed by a number.
    """
    # Escape the keyword so regex metacharacters in it ("$", "(", "." ...)
    # are matched as plain text instead of being interpreted as syntax.
    pattern = r'{}[\u4e00-\u9fa5]*([\d\.]+)'.format(re.escape(keyword))
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return "0"


# def my_job():
#   # 执行JavaScript来刷新页面
#   driver.execute_script("window.location.reload();")
#   time.sleep(random.randint(2, 3))
#   driver.find_element(by=By.XPATH, value='//*[@id="layout"]/div/div/div/div/div/span/span').click()
#   time.sleep(random.randint(3, 4))
#   driver.find_element(by=By.XPATH, value='//*[@id="layout"]/div/div/div/div/div/span/span').click()
#   get_data()
    # work_book.save("盼之.xls")


def extract_datetime(text):
    """Find the first ``YYYY-MM-DD HH:MM:SS`` timestamp inside ``text``.

    Args:
        text: Text to scan.

    Returns:
        The matched timestamp substring, or ``None`` if ``text`` contains
        no timestamp in that layout.
    """
    found = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', text)
    return found.group() if found else None

    # 配置邮箱发送信息


def con():
    """Open a new MySQL connection and return it.

    No ``host`` is passed, so pymysql's default host applies.

    Returns:
        The ``pymysql`` connection object. The caller is responsible for
        closing it.
    """
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration or the environment instead.
    # Return the connection: binding it to a local and falling off the end
    # of the function would leave the caller with nothing to use or close.
    return pymysql.connect(
        port=3306,
        user='root',
        password='cyj19950610',
        db='mysql',
        charset='utf8'
    )


# 计算页面文本中包含关键字的数量


class save:
    """Namespace for the shared MySQL connection, cursor and INSERT statement.

    The class body runs once, when the class is defined, so the connection
    is opened at that moment. ``get_data()`` reads ``save.cursor``,
    ``save.sql`` and ``save.db``.
    """

    # Connection used for all inserts; no host is passed, so pymysql's
    # default host applies.
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    db = pymysql.connect(
        port=3306,
        user='root',
        password='cyj19950610',
        db='mysql',
        charset='utf8'
    )
    cursor = db.cursor()
    # Parameterized insert. "insert ignore into" must be three separate
    # words; with "ignore" and "into" fused the statement is a syntax error.
    sql = 'insert ignore into db_px_wwqy (bh, nss, amount, rate, info_time) values(%s, %s, %s, %s, %s)'




def get_data():
    """Scrape the listings currently rendered on the page and store new ones.

    Reads the module-level ``driver`` (a Selenium WebDriver). For every
    element matched by ``//div[@class="game_list"]/div/div`` it extracts the
    listing text and price, derives an id (first 10 characters of the text)
    and a rate (4% of the number following "无畏点数", rounded), prints them,
    and inserts the row through ``save.cursor`` unless a row with the same
    id already exists in ``db_px_wwqy``.

    Raises:
        ValueError: if the text captured after "无畏点数" is not a valid
            number.
    """
    divs = driver.find_elements(by=By.XPATH, value='//div[@class="game_list"]/div/div')
    print(divs)
    for div in divs:
        nss = div.find_element(by=By.XPATH,
                               value='.//div/div/a/div/div').text
        # The leading "." keeps the lookup relative to this row. An XPath
        # starting with "//" searches the whole document even when called on
        # an element, which would return the same first match for every row.
        amount_text = div.find_element(by=By.XPATH,
                                       value='.//div/div/div').text
        amount = get_ts("¥", amount_text)
        info_time = datetime.now()
        bh = nss[:10]
        ds = get_ts("无畏点数", nss)
        # float(), not int(): the captured text may contain a decimal point.
        rate = round(float(ds) * 0.04)
        print(f"文本: {nss}")
        print(f"金额: {amount}")
        print(f"折损: {rate}")

        # Check whether this id is already stored. A fresh connection per
        # row sees rows committed earlier in this same run.
        connection = pymysql.connect(host='localhost', user='root', password='cyj19950610', db='mysql')
        try:
            with connection.cursor() as cursor:
                cursor.execute("SELECT 1 FROM db_px_wwqy WHERE bh=%s", (bh,))
                already_stored = cursor.fetchone() is not None
        finally:
            # Close even if the query raises so connections are not leaked.
            connection.close()

        if already_stored:
            continue

        save.cursor.execute(save.sql,
                            (bh, nss, amount, rate, info_time))
        save.db.commit()

if __name__ == '__main__':
    # Script entry point: start headless Chrome, open the listing page,
    # click the header control twice, then scrape the rendered rows.
    #
    # Disabled Excel export kept for reference:
    # div_list = ["编号", "天赏石", "天霓染", "金额", "天赏均价", "发布时间"]
    # col = 0  # column/row cursor
    # row = 1
    # work_book = xlwt.Workbook(encoding='utf-8')  # create the workbook
    # work_sheet = work_book.add_sheet('tb_shopping')  # create a sheet
    # pattern = xlwt.Pattern()  # cell style
    # pattern.pattern = xlwt.Pattern.SOLID_PATTERN
    # pattern.pattern_fore_colour = 5
    # for i in range(6):  # write the header row
    #   work_sheet.col(i).width = 4444
    #   work_sheet.write(0, i, div_list)

    # All switches go on the single options object that is actually passed
    # to Chrome; flags set on a second, unused ChromeOptions have no effect.
    opts = webdriver.ChromeOptions()
    # Use the command-line switch: the `headless` property was removed from
    # recent Selenium releases, where assigning it silently does nothing.
    opts.add_argument('--headless')
    opts.add_argument('--ignore-certificate-errors')  # ignore certificate errors
    opts.add_argument("--disable-3d-apis")
    opts.add_argument('log-level=3')
    opts.add_argument('blink-settings=imagesEnabled=false')  # skip image loading

    # NOTE: this rebinds the module-level name `driver` (also imported at the
    # top of the file) to the WebDriver instance that get_data() reads.
    driver = webdriver.Chrome(options=opts)
    try:
        driver.get('https://www.pxb7.com/selectgame?game_id=148&game_alias=games&gameAlias=wwqy')
        driver.implicitly_wait(1)
        driver.maximize_window()
        driver.find_element(by=By.XPATH, value='//*[@id="layout"]/div/div/div/div/div/span/span').click()
        time.sleep(random.randint(2, 3))
        driver.find_element(by=By.XPATH, value='//*[@id="layout"]/div/div/div/div/div/span/span').click()
        time.sleep(random.randint(2, 3))
        get_data()
        # # Create the scheduler
        # scheduler = BlockingScheduler()
        # # Add the job: run at a fixed interval
        # scheduler.add_job(my_job, 'interval', seconds=10)
        # scheduler.start()
    finally:
        # Always shut the browser down so Chrome/chromedriver processes are
        # not left running after the script ends or fails.
        driver.quit()



这是我的代码

numbersi 发表于 2024-8-30 10:09

是不是动态加载的原因

xiaojipkhuang1 发表于 2024-8-30 10:11

numbersi 发表于 2024-8-30 10:09
是不是动态加载的原因

动态加载的话但是这个xpath位置都是不变的诶,只是内容变化

numbersi 发表于 2024-8-30 10:16

前端vue写的,动态加载的,dom都是动态生成的,你静态获取是空

xiaojipkhuang1 发表于 2024-8-30 10:19

numbersi 发表于 2024-8-30 10:16
前端vue写的,动态加载的,dom都是动态生成的,你静态获取是空

啊,那这种有啥办法可以获取内容吗

uuwatch 发表于 2024-8-30 10:22

//*[@id="exposureType:productList|productId:297|gameId:148|price:60|index:1|rcToken:148_297^1724984233_1724984233_4^57_"]/div/a/span
//div[@class="game_list"]/div/div/div/div/a/div/div
要不你试试用正则匹配?

numbersi 发表于 2024-8-30 10:25

xiaojipkhuang1 发表于 2024-8-30 10:19
啊,那这种有啥办法可以获取内容吗

动态获取你看api啊

xiaojipkhuang1 发表于 2024-8-30 10:26

uuwatch 发表于 2024-8-30 10:22
//*[@id="exposureType:productList|productId:297|gameId:148|price:60|index:1|rcToken:148_297^17249842 ...

正则的话这个xpath是动态的

xiaojipkhuang1 发表于 2024-8-30 10:27

numbersi 发表于 2024-8-30 10:25
动态获取你看api啊

好吧,动态的话get的地址也是动态变化的,参数我都逆向了,就是请求的地址有个加密的逆不出来

qianxiaohe 发表于 2024-8-30 10:45

可以换掉selenium,试试playwright库。安装库的命令:pip install playwright;安装对应浏览器和驱动的命令:playwright install;录制脚本的命令:python -m playwright codegen 网址
页: [1] 2 3
查看完整版本: 请教大神们一个Python selenium问题