首先,Selenium 本身已经会为每个浏览器启动一个独立进程,因此使用多线程而不是多进程要好得多——无论哪种方式,每个工作线程/进程最终都会对应一个浏览器进程。另外,在 `scrape_urls` 中,在你的 `driver = webdriver.Chrome(driver_dir)` 语句之后,函数的其余部分应包含在 `try/finally` 语句中,并在 `finally` 块中调用 `driver.quit()`,以确保无论是否发生异常,驱动程序进程都会被终止。而按你目前的写法,所有驱动程序进程在出错时都会一直保持运行。
您还可以考虑使用以下技术:创建一个大小为 4 的线程池(如果要处理的 URL 更少,则取更小的值),并让池中的每个线程自动重用先前分配给它的驱动程序——该驱动程序保存在线程本地存储(thread-local storage)中。您可能希望修改创建驱动程序时使用的选项(当前为无头 headless 模式):
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from multiprocessing.pool import ThreadPool
import threading
import gc
threadLocal = threading.local()


class Driver:
    """Wrapper around a headless Chrome driver, cached per thread.

    Each worker thread gets exactly one Driver instance (stored in
    thread-local storage) so the expensive browser process is reused
    across all URLs handled by that thread.
    """

    def __init__(self):
        opts = webdriver.ChromeOptions()
        opts.add_argument("--headless")
        opts.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=opts)

    def __del__(self):
        # Terminate the browser process when this wrapper is garbage-collected.
        self.driver.quit()
        print('The driver has been "quitted".')

    @classmethod
    def create_driver(cls):
        """Return this thread's cached webdriver, creating it on first use."""
        cached = getattr(threadLocal, 'the_driver', None)
        if cached is None:
            print('Creating new driver.')
            cached = cls()
            threadLocal.the_driver = cached
        driver = cached.driver
        # Drop the local reference so only thread-local storage keeps the
        # Driver wrapper alive (lets `del threadLocal` + gc reclaim it later).
        cached = None
        return driver
def scraper(url):
    """Scrape a single URL using this thread's cached driver.

    Waits up to 7 seconds for an element with class "InterestingData"
    to appear, then returns its text.
    """
    driver = Driver.create_driver()
    driver.get(url)
    wait = WebDriverWait(driver, timeout=7)
    element = wait.until(
        lambda d: d.find_element(by=By.CLASS_NAME, value="InterestingData")
    )
    print("got data from: ", url)
    return element.text
with open('array_of_urls', 'r') as infile:
    urls = json.load(infile)

# Never spin up more threads than there are URLs to process.
number_of_processes = min(4, len(urls))
with ThreadPool(processes=number_of_processes) as pool:
    result_array = pool.map(scraper, urls)
    # Drop the sole external reference to the thread-local storage while the
    # worker threads still exist, so each cached Driver becomes unreachable:
    del threadLocal
    # Force collection so every Driver.__del__ runs and quits its browser:
    gc.collect()
    pool.close()
    pool.join()