使用 selenium+chromedriver 尝试抓取天猫的 cookie,在 Windows 环境下抓取无异常,但部署到 Linux 服务器上后,chromedriver 残留进程一直未被回收,请诸位大神帮忙分析下原因。

2018-06-11 12:35:45 +08:00
 William55555

# coding: utf-8
import functools
import logging
import os
import Queue
import random
import re
import threading
import time
import traceback
from threading import Thread

from selenium import webdriver

# Module-wide lock shared by the cookie loader, harvester and saver: it
# serializes access to tmall_cookie.txt and to TmallCookie.cookie_list.
lock = threading.Lock()

def async(f): """异步装饰器""" def wrapper(*args, **kwargs): thr = Thread(target=f, args=args, kwargs=kwargs) thr.start() return wrapper

class TmallCookie(object): def init(self): # cookie 队列 self.cookie_queue = Queue.Queue() self.cookie_list = list() self.load_cookie() self.parse_cookie() self.save_cookie()

def load_cookie(self):
    """加载本地已保存的 cookie"""
    lock.acquire()
    with open("tmall_cookie.txt", "r") as f:
        cookie_list = f.readlines()
    lock.release()
    for i in cookie_list:
        self.cookie_queue.put(i.strip())

@async
def parse_cookie(self):
    """
    请求 cookie,并将 cookie 保存至 cookie 列表
    :return: 
    """
    urls = ['https://detail.tmall.com/item.htm?id=562345301295',
            'https://detail.tmall.com/item.htm?id=553941537843',
            'https://detail.tmall.com/item.htm?id=558646979307',
            'https://list.tmall.com/search_product.htm?spm=a221t.1812074.2005984841.8.44d84208RXceJT&q=%B9%E2%C3'
            '%E6%CE%C4%D0%D8&from=.list.pc_1_searchbutton&acm=lb-zebra-7777-1443323.1003.4.1158540&type=p&scm=100'
            '3.4.lb-zebra-7777-1443323.OTHER_14748278648600_1158540',
            'https://list.tmall.com/spu_detail.htm?fmtab=sp&cat=50105508&spuid=877471268&suid=4e5fd39570486fdf2a'
            '9b3077572be7ab&rn=1e0abfcf6995e918ab6c7bc00d6e9be2'
            ]
    option = webdriver.ChromeOptions()
    option.add_argument('disable-infobars')
    option.add_argument('disable-gpu')
    option.add_argument('--headless')
    option.add_argument("--no-sandbox")
    # option.add_argument("window-size=1024,768")
    while True:
        if self.cookie_queue.qsize() < 20000:
            try:
                # driver = webdriver.Chrome('C:\\chromedriver.exe', chrome_options=option)
                driver = webdriver.Chrome('./chromedriver', chrome_options=option)
                driver.set_page_load_timeout(120)
                url = random.choice(urls)
                driver.get(url)
                time.sleep(5)
                try:
                    cookies = driver.get_cookies()
                    cookie_string = []
                    for cookie_info in cookies:
                        cookie_string.append(u'%s=%s' % (cookie_info.get(u'name'), cookie_info.get(u'value')))
                    cookie_string = '; '.join(cookie_string)
                    driver.close()
                    driver.quit()
                except Exception as e:
                    pass
                try:
                    _tb_token_ = re.findall("(_tb_token_=.*?;)", cookie_string)[0]
                    t = re.findall("(t=[a-z0-9]+)", cookie_string)[0]
                    cna = re.findall("(cna=.*?;)", cookie_string)[0]
                    cookie2 = re.findall("(cookie2=.*?;)", cookie_string)[0].replace(";", "")
                    cookie = _tb_token_ + " " + t + "; " + cna + " " + cookie2
                    try:
                        enc = re.findall("(enc=.*?;)", cookie_string)[0]
                        cookie = _tb_token_ + " " + t + "; " + cna + " " + enc + " " + cookie2
                    except:
                        pass
                    print cookie
                    self.cookie_queue.put(cookie)
                    lock.acquire()
                    self.cookie_list.append(cookie)
                    lock.release()
                except Exception as e:
                    pass
            except Exception as e:
                print traceback.format_exc()
        else:
            time.sleep(300)

@async
def save_cookie(self):
    """
    清空之前的 cookie 文件,将当天抓取的 cookie 保存至文件
    :return: 
    """
    while True:
        time.sleep(1)
        if len(self.cookie_list) > 10:
            lock.acquire()
            # with open("tmall_cookie.txt", "w") as f1:
            #     f1.truncate()
            #     time.sleep(5)
            with open("tmall_cookie.txt", "a") as f2:
                for cookie in self.cookie_list:
                    f2.write(cookie)
                    f2.write("\n")
                self.cookie_list = []
            lock.release()

def get_cookie(self):
    """
    获取一个 cookie
    :return: tmall cookie
    """
    while True:
        try:
            cookie = self.cookie_queue.get(timeout=5)
            break
        except Exception as e:
            logging.warning("Get cookie error: %s" % e)
            time.sleep(5)
    if self.cookie_queue.qsize() <= 5000:
        self.cookie_queue.put(cookie)
    return cookie

if __name__ == '__main__':
    # Script entry point: build the cookie pool.
    # Example consumer (disabled): poll cookie.get_cookie() in a loop,
    # printing the result and sleeping 2 seconds between calls.
    cookie = TmallCookie()

1937 次点击
所在节点    问与答
2 条回复
a7a2
2018-06-11 13:31:04 +08:00
按照经验,这不是你代码的问题,而是你用的 webdriver 库与 chromedriver 之间的问题,在 macOS 下同样存在。
可以尝试调用 kill 之类的命令结束残留进程,也就是自己管理进程。
1109599636
2018-07-07 17:31:43 +08:00
我以前写的时候是换的火狐的驱动....

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/462122

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX