python 多线程爬虫问题

YUX

2016-06-13 16:21:10 +08:00

@JhOOOn 还有知乎，好像大家都想爬知乎，也不知道爬完了做什么，好像只有一个“看知乎”还有点意思。

likuku

2016-06-13 16:29:45 +08:00

Python 多线程因为有 GIL ，所以对 CPU 密集型应用没有改善，对需要等待 IO 的应用有帮助；
多进程可以用到多核 / 多 CPU ，适合应对 CPU 密集型应用。

practicer

2016-06-13 16:35:19 +08:00

@YUX
@louk78
@geek123
@Jblue
@EchoUtopia

谢谢各位，我再改一下看看，边改边学好欢乐， HOHO

practicer

2016-06-13 16:37:01 +08:00

@JhOOOn 还有爬知乎妹子头像、 1024 ，都是入门爬虫的标准目标啊，因为他们都拥有有价值的数据。

practicer

2016-06-13 16:38:19 +08:00

@likuku 请教大大我的需求是哪种的？

alexapollo

2016-06-13 19:49:53 +08:00

送几个老例子：
Scrapy: 爬取豆瓣书籍 //以及几个简单实例
http://www.oschina.net/code/snippet_1026739_33016
128 进程，图片爬虫，增量更新
http://www.oschina.net/code/snippet_1026739_43930

以及可以戳这里： https://github.com/geekan/scrapy-examples

practicer

2016-06-14 12:42:08 +08:00

@alexapollo
@YUX
@likuku
@geek123
@EchoUtopia
@Jblue

最后我放弃用多线程|多进程改这个爬虫了，还是没弄懂，打算多读一读各位列出的源码。

后面修改了一次爬虫，从逻辑上减少了一轮解析 HTML 的次数，也算是减少了爬取网页的时间：
1.fetchBooks(u'爬虫') 2.exportCsv(bookUrls)
解析页面分页的时候把 book 的详细页和翻页链接一次保存，上一个版本中为了得到他们 urlopen 了两次，比较浪费时间，另外用 global variable 来更新 book 详细页，翻页链接用递归来获取。

# -*- coding: UTF-8 -*-

import os
import re
import time
import json
import random
import urlparse
import unicodecsv as csv
from urllib2 import urlopen
from urllib2 import HTTPError
from bs4 import BeautifulSoup

import logging
# Write log records of level DEBUG and above to the file douban.log.
logging.basicConfig(filename='douban.log', level=logging.DEBUG)

# Book detail-page URLs collected so far. findBooks() adds to this set and
# the __main__ block hands it to exportCsv().
bookUrls = set()

def fetchBooks(start):
    '''Walk every listing page of a Douban book tag.

    Starts at the tag's first listing page and keeps following the
    "next page" URL returned by findPages() until it returns a falsy
    value. Nothing is returned; findPages() is what processes each page.

    Args:
        start: Tag name (unicode) appended to the tag listing URL.
    '''
    page = u'https://book.douban.com/tag/' + start
    while page:
        nextPage = findPages(page)
        # Report the page that was just scraped, not the upcoming one
        # (which is None once the last page has been handled).
        print('Scraping books on page {!r} done'.format(page))
        logging.info('Scraping books on page {!r} done'.format(page))
        page = nextPage
        if page:
            # Random 1-10 s pause between requests; presumably to avoid
            # hammering the site. No need to wait after the final page.
            time.sleep(random.randint(1, 10))

def exportCsv(books):
    '''Download every book's details and write them to books.csv.

    The file is created in the directory of this script. A header row is
    written first, then one row per successfully downloaded book. A book
    whose page cannot be fetched is logged and skipped so that a single
    network failure does not abort the whole export.

    Args:
        books: Iterable of book detail-page URLs.
    '''
    headers = (u'书名', u'原书名', u'出版日期', u'页数',
               u'豆瓣评分', u'评价人数', u'ISBN', u'网址', u'TOP 评论')
    csvPath = os.path.join(os.path.dirname(__file__), 'books.csv')
    with open(csvPath, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        for book in books:
            try:
                line = download(book)
            except IOError as e:
                # urllib2's URLError/HTTPError derive from IOError in
                # Python 2, so this covers failed requests.
                print('Skipping the book {!r}: {}'.format(book, e))
                logging.error('Skipping the book {!r}: {}'.format(book, e))
            else:
                writer.writerow(line)
                # Index 6 of the row is the ISBN column.
                print('Saving the book {} done'.format(line[6]))
                logging.info('Saving the book {} done'.format(line[6]))
            # Random 1-10 s pause before the next request.
            time.sleep(random.randint(1, 10))
    print('Saving ALL done')
    logging.info('Saving ALL done')

def findPages(pageUrl):
    '''Scrape one tag listing page and return the URL of the next one.

    The page's book links are always handed to findBooks(), including on
    the last page, which has no "next" link.

    Args:
        pageUrl: Listing page URL (may contain non-ASCII characters; it is
            converted with iriToUri() before the request).

    Returns:
        The absolute URL of the next listing page, or None when the page
        has no <link rel="next"> element with an href.
    '''
    response = urlopen(iriToUri(pageUrl))
    try:
        bsObj = BeautifulSoup(response)
    finally:
        # The markup has been read by the parser; release the connection.
        response.close()

    # Collect books before looking for the next link, otherwise the final
    # page of a tag would be skipped.
    findBooks(bsObj)

    linkEle = bsObj.find('link', {'rel': 'next'})
    if linkEle is None or 'href' not in linkEle.attrs:
        return None
    return u'https://book.douban.com' + linkEle.attrs['href']

def findBooks(bsObj):
    '''Add the book detail-page links of one listing page to bookUrls.

    Every <a class="nbg"> element with an href that is not already known
    is added to the module-level bookUrls set and reported.

    Args:
        bsObj: Parsed listing page (BeautifulSoup object).

    Returns:
        The module-level bookUrls set, which is also updated in place.
    '''
    # findAll() returns a (possibly empty) list, so no None check is needed.
    for anchor in bsObj.findAll('a', {'class': 'nbg'}):
        href = anchor.attrs.get('href')
        if href is None or href in bookUrls:
            continue
        # Record the URL first so a reporting problem cannot lose it.
        bookUrls.add(href)
        try:
            print('Found new book: {}'.format(href))
            logging.info('Found new book: {}'.format(href))
        except UnicodeError as e:
            # Formatting a non-ASCII unicode href into a byte string fails
            # on Python 2; keep going with the remaining links.
            logging.exception('{}'.format(e))
    return bookUrls

def urlEncodeNonAscii(b):
    """Percent-encode every character in the range 0x80-0xFF of *b*.

    ASCII characters are left untouched. Each matched character is
    replaced by '%XX', using uppercase hexadecimal digits as RFC 3986
    recommends for URI producers.

    Args:
        b: String to escape; iriToUri() passes UTF-8 encoded byte strings.

    Returns:
        The string with all 0x80-0xFF characters percent-encoded.
    """
    return re.sub('[\x80-\xFF]',
                  lambda match: '%%%02X' % ord(match.group(0)),
                  b)

def iriToUri(iri):
    """Convert an IRI (a URL that may contain non-ASCII text) to a URI.

    The URL is split with urlparse. The network-location component is
    encoded with the 'idna' codec; every other component is UTF-8 encoded
    and then escaped with urlEncodeNonAscii(). The pieces are joined again
    with urlunparse.
    """
    netlocIndex = 1  # position of the host part in the urlparse() result
    encoded = []
    for index, component in enumerate(urlparse.urlparse(iri)):
        if index == netlocIndex:
            encoded.append(component.encode('idna'))
        else:
            encoded.append(urlEncodeNonAscii(component.encode('utf-8')))
    return urlparse.urlunparse(encoded)

def getFullReview(reviewId):
    '''Fetch the full text of a review from Douban's JSON endpoint.

    Requests /j/review/<reviewId>/fullinfo, takes the 'html' field of the
    JSON response and returns the part of it that precedes a '<div'.

    Args:
        reviewId: Review identifier; converted with str() for the URL.

    Returns:
        The extracted review text, or None when the request fails with an
        HTTPError, the payload is not the expected JSON, or the html
        contains no '<div'.
    '''
    url = 'https://book.douban.com/j/review/' + str(reviewId) + '/fullinfo'
    try:
        response = urlopen(url)
        try:
            html = json.loads(response.read())['html']
        finally:
            response.close()
    except HTTPError as e:
        # str(e) carries the status line; e.message is empty for HTTPError.
        print(e)
        logging.error('Error: {}'.format(e))
        return None
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or the JSON lacks an 'html' entry.
        print(e)
        logging.error('Error: {}'.format(e))
        return None

    # NOTE(review): '.' does not match newlines, so only text on a single
    # line before '<div' is captured - confirm this is intended.
    match = re.search('.*(?=<div)', html)
    if match is None:
        return None
    return match.group()

def _infoText(info, pattern):
    '''Return the stripped first text node under *info* matching *pattern*.

    Returns '' (after logging the error) when *info* is None or no text
    node matches.
    '''
    try:
        return info.find(text=re.compile(pattern)).strip()
    except AttributeError as e:
        print(e)
        logging.exception('{}'.format(e))
        return ''


def _tagText(bsObj, name, attrs):
    '''Return get_text() of the first tag matching *name*/*attrs*, or ''.'''
    try:
        return bsObj.find(name, attrs).get_text()
    except AttributeError as e:
        # find() returned None: the page has no such element.
        print(e)
        logging.exception('{}'.format(e))
        return ''


def download(bookUrl):
    '''Fetch a book detail page and extract one CSV row from it.

    Every field is extracted on a best-effort basis: a field that cannot
    be found is logged and returned as ''.

    Args:
        bookUrl: URL of the book's detail page.

    Returns:
        A 9-tuple (title, origName, publishY, pageNum, rating, numRating,
        isbn, addr, review), where addr is bookUrl itself.
    '''
    response = urlopen(bookUrl)
    try:
        bsObj = BeautifulSoup(response)
    finally:
        # The markup has been read by the parser; release the connection.
        response.close()

    # Most fields live in the element with id="info"; look it up once.
    info = bsObj.find(id='info')
    isbn = _infoText(info, r'(\d{10})|(\d{13})')
    publishY = _infoText(info, r'\d{4}-\d{1,2}(-\d{1,2})?')
    pageNum = _infoText(info, r'^\s\d{3,4}$')

    origName = ''
    try:
        label = info.find(text=u'原作名:')
        if label is not None:
            # The value is the node following the label's parent element.
            origName = label.parent.next_sibling.strip()
    except AttributeError as e:
        print(e)
        logging.exception('{}'.format(e))
        origName = ''

    # The class string, including its trailing space, is matched as written.
    rating = _tagText(bsObj, 'strong', {'class': 'll rating_num '}).strip()
    numRating = _tagText(bsObj, 'span', {'property': 'v:votes'})

    review = ''
    try:
        reviewTag = bsObj.find('div', {'id': re.compile(r'tb-(\d+)')})
        # Drop the first three characters ('tb-') to get the review id.
        reviewId = reviewTag.attrs['id'][3:]
        review = getFullReview(reviewId) or ''
    except AttributeError as e:
        print(e)
        logging.exception('{}'.format(e))
        review = ''

    title = _tagText(bsObj, 'span', {'property': 'v:itemreviewed'})
    addr = bookUrl
    return (title, origName, publishY, pageNum, rating,
            numRating, isbn, addr, review)

if __name__ == '__main__':
    # Script entry point: collect the book URLs for one tag, export their
    # details to CSV, and report start/finish times on stdout and in the log.
    startTemplate = 'Starting at: {}'
    finishTemplate = 'All finished at: {}'

    print(startTemplate.format(time.ctime()))
    logging.info(startTemplate.format(time.ctime()))

    fetchBooks(u'股票')
    exportCsv(bookUrls)

    print(finishTemplate.format(time.ctime()))
    logging.info(finishTemplate.format(time.ctime()))

EchoUtopia

2016-06-14 12:55:40 +08:00

@practicer 爬虫最好用异步，这有一篇教程，用 python3 异步模块编写爬虫，真的很经典
http://aosabook.org/en/500L/a-web-crawler-with-asyncio-coroutines.html

practicer

2016-06-14 14:18:10 +08:00

@EchoUtopia 好难懂，我慢慢啃吧，谢谢分享。

EchoUtopia

2016-06-14 15:48:30 +08:00

@practicer 可以直接看代码，结合着 python 的 asyncio 模块文档，很快的
https://github.com/aosabook/500lines/blob/master/crawler/code/crawling.py

practicer

2016-06-28 15:24:22 +08:00

这段时间一直在熟悉 scrapy ，得知它是由异步框架 twisted 搭建的，并且拿 scrapy 对比自己写的爬虫后，深深感受到 scrapy 异步回调的威力。

爬虫的正确姿势是异步编程。推荐一个讲解异步模型（ twisted 框架）的电子书，从浅到深介绍如何将同步程序重构成异步非阻塞程序 https://www.gitbook.com/book/likebeta/twisted-intro-cn/details

该书第 17 章----生成器实现的异步方式，便是 scrapy 中最常使用的方法了 https://likebeta.gitbooks.io/twisted-intro-cn/content/zh/p17.html 。还有 @EchoUtopia 推荐的文章中介绍的 asyncio 模块，都是正确的爬虫姿势。

nik

2016-11-16 15:17:22 +08:00

@practicer 不知你是否在北京？我们公司需要爬虫工程师

practicer

2016-11-21 16:25:54 +08:00

@nik 我在十八线省会城市, 不在北京