正好手里有个多进程的例子。如果各任务之间的数据相互独立，最好用多进程：受 GIL 限制，Python 的多线程无法真正并行执行 CPU 密集型代码，跟闹着玩似的。
[Python] 纯文本查看 复制代码
import os
from multiprocessing.pool import Pool
import pymongo
from drivers import firefox, refresh_page, get_header_detail, click_each_url, if_new_window_is_opened, \
if_detail_page_loaded, get_overview_detail
def _close_extra_windows(browser, keep_handles, return_to):
    """Close every window whose handle is not in ``keep_handles``, then focus ``return_to``."""
    for handle in browser.window_handles:
        if handle not in keep_handles:
            browser.switch_to.window(handle)
            browser.close()
    browser.switch_to.window(return_to)


def get_products_in_each_page(page):
    """Scrape one search-result page and return the collected product dicts.

    Starts a browser via ``firefox()``, loads the listing URL for ``page``,
    and walks every element with class ``organic-list-offer-inner``. For each
    one a fresh dict is filled by ``get_header_detail``; the product link is
    then clicked and, if a new window opens and its detail page loads, the
    same dict is extended by ``get_overview_detail``.

    Args:
        page: Page number interpolated into the ``page=`` query parameter.

    Returns:
        A list with one dict per product whose detail window opened and
        loaded. Products failing either check are skipped.
    """
    browser = firefox()
    try:
        browser.get(
            f'https://www.alibaba.com/products/Schisandra_Chinensis.html?spm=a2700.galleryofferlist.0.0.2a1363ffJcGqtX&IndexArea=product_en&page={page}')
        refresh_page(browser)
        # Remember the listing window explicitly: the order of window_handles
        # is not guaranteed, so index 0 is not a reliable way back.
        main_window = browser.current_window_handle
        # NOTE(review): find_elements_by_class_name was removed in Selenium 4;
        # kept as-is because the Selenium version behind firefox() is unknown.
        products = browser.find_elements_by_class_name('organic-list-offer-inner')
        items = []
        for product in products:
            item = {}
            get_header_detail(product, item)
            main_handles = browser.window_handles
            href = click_each_url(product)
            if not if_new_window_is_opened(browser, main_handles):
                continue
            browser.switch_to.window(browser.window_handles[-1])
            loaded = if_detail_page_loaded(browser)
            if loaded:
                get_overview_detail(browser, item, href)
            # Always close the detail window and refocus the listing, even
            # when the page failed to load; otherwise the next iteration runs
            # against the wrong window and stray windows pile up.
            _close_extra_windows(browser, main_handles, main_window)
            if not loaded:
                continue
            items.append(item)
            print(item)
        return items
    finally:
        # Release the browser process even if any step above raised.
        browser.quit()
def get_products(page):
    """Scrape one result page and store its products in MongoDB.

    Calls ``get_products_in_each_page(page)`` and inserts the returned dicts
    into collection ``wwz`` of database ``alibaba`` on ``172.17.0.2:27017``.
    Nothing is written when the page yields no items.

    Args:
        page: Page number forwarded to ``get_products_in_each_page``.
    """
    items = get_products_in_each_page(page)
    if not items:
        # insert_many() rejects an empty document list with an exception,
        # which would otherwise surface as a silent worker failure.
        return
    # Credentials are passed to MongoClient because Database.authenticate()
    # was deprecated in PyMongo 3.x and removed in PyMongo 4.0. authSource
    # matches the database the original authenticated against.
    # NOTE(review): credentials are hard-coded; consider reading them from
    # the environment or a config file.
    with pymongo.MongoClient(
        '172.17.0.2',
        27017,
        username='root',
        password='a',
        authSource='alibaba',
    ) as client:
        # The connection is opened only after scraping, so it is not held
        # idle during the long browser session; the context manager closes it.
        client['alibaba']['wwz'].insert_many(items)
if __name__ == '__main__':
    # Fan the result pages out over two worker processes; each task calls
    # get_products() for one page number.
    print('Parent process %s.' % os.getpid())
    last_page = 37
    pool = Pool(2)
    # Keep every AsyncResult: apply_async() stores a worker's exception on
    # the result object, and it is lost unless get() is called.
    pending = [
        (page, pool.apply_async(get_products, args=(page,)))
        for page in range(1, last_page + 1)
    ]
    print('Waiting for all subprocesses done...')
    pool.close()
    pool.join()
    for page, result in pending:
        try:
            result.get()
        except Exception as exc:
            # Report the failure and carry on, so one bad page does not hide
            # the outcome of the others.
            print(f'Page {page} failed: {exc!r}')
    print('All subprocesses done.')