在通义千问的帮助下，做的一个爬虫脚本，大模型真是越来越厉害

iprogramer 发表于 2024-6-26 22:31

本帖最后由 iprogramer 于 2024-6-26 22:39 编辑

import requests
from bs4 import BeautifulSoup
import os
import re
import base64
import logging

# Skeleton of every saved page. ``str.format`` fills in {title}, {style} and
# {content}. <title> lives inside <head>, after the charset declaration, so the
# document is valid HTML and the encoding is known before any title text is read.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title}</title>
<style>{style}
</style>
</head>
<body class='container'>
{content}
</body>

</html>
"""

# Stylesheet substituted for {style} above. It is passed to ``format`` as a
# value, so the literal CSS braces below do not need to be doubled.
style = """
body {
   font-family: Arial, sans-serif;
}
.container {
   width: 700px;
   margin: 0 auto; /* 居中 */
   margin-top:30px;
}
.container img { /* 添加这一段来让图片水平居中 */
   display: block; /* 将图片视为块级元素以便应用margin */
   margin: 0 auto; /* 实现水平居中 */
   max-width: 60%; /* 确保图片不超过容器宽度 */
   height: auto; /* 保持图片原始宽高比 */
}
.container h1 {
   font-size:36px;
   margin: 0 auto; /* 居中 */
}
"""

def setup_logger():
    """Configure the root logger to emit INFO and above with a timestamped format."""
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class WebScraper:
    """Download secrss.com articles one by one and save each as a standalone HTML file.

    Article images are inlined as ``data:`` URIs so the saved page needs no
    network access. The next article ID to process is persisted in
    ``self.last_id_path`` so an interrupted run can resume.
    """

    def __init__(self, start_id=0):
        """Store the ID to start from when no progress file exists."""
        self.start_id = start_id
        self.last_id_path = "last_id.txt"

    def load_last_id(self):
        """Return the article ID to resume from.

        Falls back to ``self.start_id`` when the progress file is missing or
        does not contain an integer.
        """
        if not os.path.exists(self.last_id_path):
            return self.start_id
        try:
            with open(self.last_id_path, 'r') as file:
                return int(file.read().strip())
        except ValueError:
            # A corrupt progress file should not abort the whole crawl.
            logging.warning(f'进度文件 {self.last_id_path} 内容无效，从 {self.start_id} 开始')
            return self.start_id

    @staticmethod
    def save_last_id(last_id, path="last_id.txt"):
        """Write ``last_id`` to the progress file at ``path``."""
        with open(path, "w") as file:
            file.write(str(last_id))

    @staticmethod
    def make_request(url, timeout=10):
        """GET ``url`` and return the response, or ``None`` on any request error.

        ``timeout`` (seconds) keeps a stalled server from blocking the crawl
        forever. HTTP error statuses are treated as failures.
        """
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # turn 4xx/5xx into an exception
            return response
        except requests.RequestException as e:
            logging.error(f"请求错误: {e}")
            return None

    @staticmethod
    def sanitize_filename(file_name):
        """Return ``file_name`` reduced to word characters, hyphens and underscores.

        Every remaining whitespace character (spaces, but also tabs and
        newlines that may occur inside a heading) becomes an underscore.
        """
        cleaned = re.sub(r'[^\w\s-]', '', file_name).strip()
        return re.sub(r'\s', '_', cleaned)

    @staticmethod
    def _as_data_uri(img_url, encoded):
        """Wrap a Base64 payload in a ``data:`` URI, guessing the MIME type from the URL."""
        import mimetypes  # local import: not part of the file's import block

        if encoded.startswith('data:'):
            return encoded
        # Drop query string / fragment so the extension is at the end of the string.
        path_only = img_url.split('?', 1)[0].split('#', 1)[0]
        mime_type = mimetypes.guess_type(path_only)[0]
        if not mime_type or not mime_type.startswith('image/'):
            mime_type = 'image/jpeg'  # fallback when the URL has no usable extension
        return f"data:{mime_type};base64,{encoded}"

    def parse_webpage(self, response):
        """Extract ``(title, article_body, pub_date)`` from an article page.

        ``title`` and ``pub_date`` are ``""`` when the ``h1`` / ``.time``
        element is absent; ``article_body`` is ``None`` when no element with
        class ``article-body`` exists. Hyphens are removed from the date text.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('h1')
        title = title_tag.text.strip() if title_tag is not None else ""
        article_body = soup.find(class_='article-body')
        pub_date_tag = soup.find(class_='time')
        pub_date = pub_date_tag.text.strip().replace("-", "") if pub_date_tag is not None else ""
        return title, article_body, pub_date

    def download_and_encode_image(self, img_url):
        """Download ``img_url`` and return its bytes as Base64 text, or ``None`` on failure."""
        response = self.make_request(img_url)
        if response is None:
            return None
        return base64.b64encode(response.content).decode('utf-8')

    def embed_images_in_html(self, soup, images_base64):
        """Replace the ``src`` of http(s) images in ``soup`` with inline ``data:`` URIs.

        ``images_base64`` maps an image URL to its Base64 payload; images whose
        download failed (value ``None``) keep their original ``src``.
        """
        for img in soup.find_all('img'):
            img_src = img.get('src')
            if not img_src or not img_src.startswith(('http:', 'https:')):
                continue
            encoded = images_base64.get(img_src)
            if encoded:
                # A bare Base64 string is not a valid src; browsers need the data: scheme.
                img['src'] = self._as_data_uri(img_src, encoded)

    def scrape_and_process(self, url):
        """Fetch one article, inline its images and write it to disk.

        Pages that cannot be fetched or that have no article body are skipped.
        """
        response = self.make_request(url)
        if response is None:
            return

        title, article_body, pub_date = self.parse_webpage(response)
        if article_body is None:
            logging.warning(f'未找到文章正文，已跳过：{url}')
            return
        title_sanitized = self.sanitize_filename(title)

        # Download each distinct http(s) image once; <img> tags without src are ignored.
        images_base64 = {}
        for img in article_body.find_all('img', src=True):
            img_src = img['src']
            if img_src.startswith(('http:', 'https:')) and img_src not in images_base64:
                images_base64[img_src] = self.download_and_encode_image(img_src)

        self.embed_images_in_html(article_body, images_base64)

        html_body = html_template.format(title=title_sanitized, content=str(article_body), style=style)

        author = '安全内参'
        file_name = f"[{author}] - {pub_date} - {title_sanitized}.html"
        # Assumes pub_date starts with YYYYMMDD once hyphens are removed — TODO confirm
        # against the site's ".time" text.
        year, month, day = pub_date[:4], pub_date[4:6], pub_date[6:8]
        folder_to_save = "D:\\微信文件"
        target_dir = os.path.join(folder_to_save, author, year, month, day)
        f_name = os.path.join(target_dir, file_name)

        try:
            os.makedirs(target_dir, exist_ok=True)  # exist_ok: no separate existence check needed
            with open(f_name, 'w', encoding='utf-8') as file:
                file.write(html_body)
            logging.info(f'文章已成功保存至：{f_name}')
        except OSError as e:
            logging.error(f'保存文件时发生错误：{e}')

    def run(self, end_id=90000):
        """Process article IDs from the saved position up to (excluding) ``end_id``.

        After each article the next ID is written to ``self.last_id_path``.
        """
        last_id = self.load_last_id()
        for i in range(last_id, end_id):
            url = f'https://www.secrss.com/articles/{i}'
            print(f"正在处理：{url}")
            self.scrape_and_process(url)
            self.save_last_id(i + 1, self.last_id_path)

if __name__ == '__main__':
    # Script entry point: configure logging, then crawl from the saved position.
    setup_logger()
    scraper = WebScraper()
    scraper.run()

iprogramer 发表于 2024-6-26 22:31

刚学了 Python 半个月，希望高手能修改成多线程的，爬得太慢了

andyop 发表于 2024-6-26 23:19

iprogramer 发表于 2024-6-26 22:31
刚学了python半个月，希望高手能修改成多线程的，爬的太慢了

grequests

iprogramer 发表于 2024-6-26 23:24

import base64
import logging
import os
import re
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional

import requests
from bs4 import BeautifulSoup

# Skeleton of every saved page. ``str.format`` fills in {title}, {style} and
# {content}. <title> lives inside <head>, after the charset declaration, so the
# document is valid HTML and the encoding is known before any title text is read.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title}</title>
<style>{style}
</style>
</head>
<body class='container'>
{content}
</body>

</html>
"""

# Stylesheet substituted for {style} above. It is passed to ``format`` as a
# value, so the literal CSS braces below do not need to be doubled.
style = """
body {
   font-family: Arial, sans-serif;
}
.container {
   width: 700px;
   margin: 0 auto; /* 居中 */
   margin-top:30px;
}
.container img { /* 添加这一段来让图片水平居中 */
   display: block; /* 将图片视为块级元素以便应用margin */
   margin: 0 auto; /* 实现水平居中 */
   max-width: 60%; /* 确保图片不超过容器宽度 */
   height: auto; /* 保持图片原始宽高比 */
}
.container h1 {
   font-size:36px;
   margin: 0 auto; /* 居中 */
}
"""

# Configure logging: INFO and above, timestamped, on the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def download_image_to_base64(url: str, timeout: float = 10) -> Optional[str]:
    """Download an image and return it as a ``data:`` URI.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``data:<mime>;base64,<payload>``, or ``None`` when the request fails
        (the error is logged). The MIME type comes from the response's
        ``Content-Type`` header, falling back to ``image/jpeg`` when the
        header is absent or not an image type.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"网络请求错误: {e}")
        return None

    # Strip parameters such as "; charset=..." from the header value.
    mime_type = response.headers.get('Content-Type', '').split(';', 1)[0].strip().lower()
    if not mime_type.startswith('image/'):
        mime_type = 'image/jpeg'
    encoded_image = base64.b64encode(response.content).decode('utf-8')
    return f"data:{mime_type};base64,{encoded_image}"

def save_progress(i: int, path: str = "last_id.txt") -> None:
    """Write the article ID ``i`` to the progress file at ``path``."""
    Path(path).write_text(str(i))

def load_progress(path: str = "last_id.txt") -> int:
    """Return the article ID stored in the progress file at ``path``.

    Returns 0 when the file does not exist or does not contain an integer.
    """
    progress_file = Path(path)
    if not progress_file.exists():
        return 0
    try:
        return int(progress_file.read_text())
    except ValueError:
        # A corrupt progress file should restart the crawl, not crash it.
        logging.warning(f"进度文件 {path} 内容无效，从 0 开始")
        return 0

def sanitize_filename(filename: str) -> str:
    """Return ``filename`` with characters unsuitable for a file name removed.

    Everything except word characters, whitespace, dots and hyphens is
    dropped. Each remaining whitespace character (including tabs and newlines,
    which can appear inside a heading) becomes a plain space, underscores
    become spaces, and dots become underscores.
    """
    cleaned = re.sub(r'[^\w\s\.-]', '', filename).strip()
    cleaned = re.sub(r'\s', ' ', cleaned)
    return cleaned.replace('_', ' ').replace('.', '_')

def scrape_and_embed_images(webpage_url: str, last_id: int) -> None:
    """Fetch one article, inline its images as ``data:`` URIs and save it as HTML.

    Args:
        webpage_url: Address of the article page.
        last_id: Unused; kept so existing callers keep working.

    Errors are logged rather than raised, so one failing article does not stop
    the worker pool.
    """
    try:
        response = requests.get(webpage_url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('h1')
        article_body = soup.find(class_='article-body')
        time_tag = soup.find(class_='time')
        if title_tag is None or article_body is None or time_tag is None:
            logging.error(f"发生错误：页面缺少标题、正文或发布时间：{webpage_url}")
            return

        # Only the article body is saved, so only its images are downloaded.
        # Each distinct http(s) URL is fetched once; failed downloads keep the original src.
        encoded_by_src = {}
        for img in article_body.find_all('img', src=True):
            img_src = img['src']
            if not img_src.startswith(('http:', 'https:')):
                continue
            if img_src not in encoded_by_src:
                encoded_by_src[img_src] = download_image_to_base64(img_src)
            if encoded_by_src[img_src]:
                img['src'] = encoded_by_src[img_src]

        pub_date = time_tag.text.strip().replace("-", "")
        title = sanitize_filename(title_tag.text.strip())
        html_body = html_template.format(title=title, content=str(article_body), style=style)
        print(f'访问成功，正在处理：{pub_date} - {title}')

        # Save the rewritten HTML under <root>/<author>/<year>/<month>/<day>/.
        author = '安全内参'
        file_name = f"[{author}] - {pub_date} - {title}.html"
        # Assumes pub_date starts with YYYYMMDD once hyphens are removed — TODO confirm
        # against the site's ".time" text.
        year, month, day = pub_date[:4], pub_date[4:6], pub_date[6:8]
        folder_to_save = "D:\\微信文件"
        target_dir = os.path.join(folder_to_save, author, year, month, day)
        os.makedirs(target_dir, exist_ok=True)  # exist_ok: no separate existence check needed
        f_name = os.path.join(target_dir, file_name)
        with open(f_name, 'w', encoding='utf-8') as file:
            file.write(html_body)
    except requests.HTTPError:
        logging.error(f"请求网页失败，状态码：{response.status_code}")
    except Exception as e:
        # Task boundary of a worker thread: log and carry on with other articles.
        logging.error(f"发生错误：{e}")

def main(end_id: int = 90000, max_workers: int = 5) -> None:
    """Scrape articles concurrently and record progress.

    Args:
        end_id: Article IDs from the saved position up to (excluding) this
            value are processed.
        max_workers: Number of worker threads.

    Futures are awaited in submission order, and after each one the ID that
    follows it is saved. A restart therefore resumes after the last article
    known to be finished together with all of its predecessors.
    """
    last_id = load_progress()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                scrape_and_embed_images,
                f'https://www.secrss.com/articles/{article_id}',
                last_id,
            ): article_id
            for article_id in range(last_id, end_id)
        }
        for future, article_id in futures.items():
            # exception() waits for completion and returns the error instead of raising it.
            error = future.exception()
            if error is not None:
                logging.error(f"处理ID {article_id} 时发生异常", exc_info=error)
            save_progress(article_id + 1)

if __name__ == '__main__':
    # Script entry point.
    main()

iprogramer 发表于 2024-6-26 23:26

把上面的代码复制到 Kimi 里面，让它帮我实现了多线程，居然能直接跑通，大模型太厉害了

long9788523 发表于 2024-6-27 08:07

确实，大模型比你更懂你的代码哦;www

Wapj_Wolf 发表于 2024-6-27 08:16

以后编程只需要懂点基础，剩下的全交给AI{:1_918:}

liuhaigang12 发表于 2024-6-27 08:21

只能针对直接的，个性化那种需求很难

willgoon 发表于 2024-6-27 08:36

简单的需求还是可以的

ciker_li 发表于 2024-6-27 08:46

你是怎么提问的？
能演示过程吗？

页: [1] 2 3

吾爱破解 - 52pojie.cn's Archiver

在通义千问的帮助下，做的一个爬虫脚本，大模型真是越来越厉害