为什么爬一些简单的福利站总能引起巨大关注量?

ryd994

2017-11-07 02:08:28 +08:00

重点在福利而不在简单
你要有能力爬复杂的福利站，一样有流量

zjlin1984

2017-11-07 08:24:15 +08:00

@ryd994 这个见解比较好。

fish19901010

2017-11-07 09:25:26 +08:00

重点在福利不在简单 +1，我相信就算是人工保存下来再过来发帖，一样能有很高的人气……关键在于分享的快乐，以及聚众看片的那种感觉。

holajamc

2017-11-07 09:34:51 +08:00

@sola97 嗯我也没有细细看过，不如抽出来关键帧去阿里鉴别一下？

yuxuan

2017-11-07 09:55:11 +08:00

最大的乐趣还是 “卧槽我在办公室直接打开了 XXX 就在我旁边”😂

jijiwaiwai

2017-11-07 10:55:04 +08:00

#encoding=utf8
import re
import os
import urllib
import requests
import download_progress
from pyquery import PyQuery

# Shared requests session; every HTTP call in this script goes through it.
httplib = requests.Session()
# To route traffic through a local proxy, assign a mapping instead of None:
# httplib.proxies = {
#     "http": "127.0.0.1:1080",
#     "https": "127.0.0.1:1080",
# }
httplib.proxies = None

# Root URL of the target site; all request URLs are built from it.
BASE_URL = "http://www.91porn.com"
# BASE_URL = "http://email.91dizhi.at.gmail.com.7h4.space"
# Host name only (BASE_URL with the "http://" scheme removed).
HOST = BASE_URL.replace("http://", "")
# Browser-like request headers passed explicitly to each httplib.get() call.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # The cookie value was hard-wrapped across physical lines, which is not a
    # valid string literal; it is rejoined here from adjacent literals with no
    # characters added or removed.
    # NOTE(review): this is a hard-coded browser session cookie - presumably
    # stale; consider loading it from configuration instead of source.
    "Cookie": (
        "__cfduid=db012482c270fa1f5ded2903a6e23bc7c1489314005; "
        "CLIPSHARE=mkfverli1mp659s49rklg7s4c6; watch_times=1; "
        "evercookie_cache=undefined; evercookie_etag=undefined; "
        "show_msg=3; 91username=dfdsdfdgfgdfgf; "
        "DUID=51a2lwwqvsbLKLuC8GzQhovCxkJIoz3nOTQ3cXXjR4w31%2FFE; "
        "USERNAME=18f7vcdO6LmTdVWCgMCD1L0IVQqGLjL0BWT%2FxBYxbXDXzFxQRSnfKsKTVg; "
        "user_level=1; EMAILVERIFIED=no; level=1; "
        "__utma=69831812.144777271.1489314006.1489314006.1489322790.2; "
        "__utmb=69831812.0.10.1489322790; __utmc=69831812; "
        "__utmz=69831812.1489314006.1.1.utmcsr=91dizhi.space|utmccn=(referral)|utmcmd=referral|utmcct=/; "
        "AJSTAT_ok_pages=8; AJSTAT_ok_times=2"
    ),
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
}

def get_vids_v1():
    """Collect video page links from the "top" category listing.

    Requests ``/v.php?category=top&viewtype=basic`` under BASE_URL and scans
    every ``<a>`` element inside ``div.listchannel``.

    Returns:
        list: the distinct href values that contain "viewkey", in the order
        they first appear on the page.
    """
    url = "%s/v.php?category=top&viewtype=basic" % BASE_URL
    data = httplib.get(url, headers=headers).content
    DOM = PyQuery(data)

    urls = []
    seen = set()
    for anchor in DOM("div.listchannel a").items():
        href = anchor.attr("href")
        # attr() gives None for an anchor without href; testing
        # `"viewkey" in None` would raise TypeError, so skip those.
        if not href or "viewkey" not in href:
            continue
        if href in seen:
            continue
        seen.add(href)
        urls.append(href)
    return urls

def get_vids():
    """Collect video page links from the featured tab of the index page.

    Requests ``/index.php`` under BASE_URL and scans every ``<a>`` element
    inside ``div#tab-featured``.

    Returns:
        list: the distinct href values that contain "viewkey", in the order
        they first appear on the page.
    """
    url = "%s/index.php" % BASE_URL
    data = httplib.get(url, headers=headers).content
    DOM = PyQuery(data)

    urls = []
    seen = set()
    for anchor in DOM("div#tab-featured a").items():
        href = anchor.attr("href")
        # attr() gives None for an anchor without href; testing
        # `"viewkey" in None` would raise TypeError, so skip those.
        if not href or "viewkey" not in href:
            continue
        if href in seen:
            continue
        seen.add(href)
        urls.append(href)
    return urls

def get_video_url_v1(url):
    """Resolve a video page to its media URL via the getfile.php endpoint.

    Downloads the page at *url*, pulls the ``file``, ``max_vid`` and
    ``seccode`` values out of the embedded ``so.addVariable(...)`` calls,
    then queries ``getfile.php`` and decodes the ``file=`` field of the reply.

    Args:
        url: address of the video page.

    Returns:
        tuple: ``(VID, video_url)`` - the numeric id string and the
        URL-decoded media address.

    Raises:
        IndexError: if any of the expected patterns is missing from the page.
    """
    data = httplib.get(url, headers=headers, timeout=10).content
    # All newlines and spaces are removed so the patterns below can match
    # regardless of how the page is formatted.
    data = data.replace("\n", "").replace(" ", "")

    # Page fragments the patterns target (shown before whitespace removal):
    #   <title> xxxx-Chinese homemade video</title>
    #   so.addVariable('file','201812');
    #   so.addVariable('max_vid','202377');
    #   so.addVariable('seccode' , '842fa039535238905a93ccb961e21183');
    # Raw strings keep the regex escapes (\d, \() intact without relying on
    # Python passing unknown string escapes through.
    title = re.findall(r"<title>(.+?)-(.+?)</title>", data)[0]
    VID = re.findall(r"so.addVariable\('file','(\d+)'\);", data)[0]
    max_vid = re.findall(r"so.addVariable\('max_vid','(\d+)'\);", data)[0]
    seccode = re.findall(r"so.addVariable\('seccode','(.+?)'\);", data)[0]

    getfile_url = "%s/getfile.php?VID=%s&mp4=0&seccode=%s&max_vid=%s" % (
        BASE_URL, VID, seccode, max_vid)

    # Example reply body:
    #   file=http%3A%2F%2F192.240.120.2%2Fmp43%2F202132.mp4%3Fst%3DeGVzsftsOLn7pxPgdeV-dg%26e%3D1489306723&domainUrl=http://91porn.ro.lt&imgUrl=http://img.file.am/91porn/>
    # Work on a copy: writing into the module-level dict would leak these
    # request-specific headers into every later request.
    request_headers = dict(headers)
    request_headers["Referer"] = url
    request_headers["X-Requested-With"] = "ShockwaveFlash/23.0.0.207"
    request_headers["Accept"] = "*/*"
    # Host must name the server; the previous "*/*" value is not a host name.
    request_headers["Host"] = HOST
    request_headers["Connection"] = "keep-alive"

    data = httplib.get(getfile_url, headers=request_headers, timeout=10).content
    video_url = data.split("&domainUrl=")[0].split("file=")[-1]
    video_url = urllib.unquote(video_url)
    print("%s %s %s" % (VID, title, video_url))
    return VID, video_url

def get_video_url(url):
    """Extract the title and media URL from a video page.

    Fetches *url*, strips line breaks from the markup, then reads the first
    ``<source src="...">`` value and the contents of the
    ``viewvideo-title`` div. Both values are printed before being returned.

    Args:
        url: address of the video page.

    Returns:
        tuple: ``(title, video_url)`` as found in the page markup.

    Raises:
        IndexError: if either pattern is not present in the page.
    """
    response = httplib.get(url, headers=headers, timeout=10)
    # Collapse the page onto one line so "." in the patterns can span it.
    html = response.content.replace("\n", "").replace("\r", "")

    source_matches = re.findall('<source src="(.+?)"', html)
    video_url = source_matches[0]
    title_matches = re.findall('<div id="viewvideo-title">(.+?)</div>', html)
    title = title_matches[0]

    print(title.decode("utf8"))
    print(video_url)
    return title, video_url

def download_video(video_url, file_name):
    """Download *video_url* into *file_name* unless that file already exists.

    Args:
        video_url: address of the media file.
        file_name: destination path; its parent directory is created when
            missing.

    Returns:
        True when *file_name* already exists (nothing is downloaded);
        otherwise None after the file has been written.
    """
    if os.path.exists(file_name):
        return True

    # Without the target directory open() fails on every download.
    target_dir = os.path.dirname(file_name)
    if target_dir and not os.path.isdir(target_dir):
        try:
            os.makedirs(target_dir)
        except OSError:
            # Another process may have created it in the meantime.
            if not os.path.isdir(target_dir):
                raise

    conn = urllib.urlopen(video_url)
    try:
        data = download_progress.chunk_read(
            conn, report_hook=download_progress.chunk_report)
    finally:
        # Release the connection even when the transfer fails.
        conn.close()

    # The file is opened only after the data has been read, so a failed
    # transfer does not leave an empty file that the exists-check above
    # would later mistake for a finished download.
    with open(file_name, "wb") as f:
        f.write(data)

# Script entry: download every video linked from the featured tab.
urls = get_vids()
for url in urls:
    print("=" * 60)
    print(url)
    try:
        title, video_url = get_video_url(url)

        # Turn the page title into a usable file name: separators and
        # spaces become "-", brackets and quotes are dropped.
        for char in ("/", "?", ":", "\\", "&", " "):
            title = title.replace(char, "-")
        for char in ("(", ")", "\"", "'"):
            title = title.replace(char, "")

        file_name = "downloads/%s.mp4" % (title)
        # Alternative naming by view key:
        # file_name = "%s.mp4" % (url.split("viewkey=")[-1].split("&")[0])
        download_video(video_url, file_name)

        # Append a marker line to the saved file. This is written directly
        # rather than through a shell command, because the file name comes
        # from scraped page content and could carry shell metacharacters.
        with open(file_name, "ab") as marker:
            marker.write(b"11111\n")

    except Exception as e:
        # Best effort: report the failure and carry on with the next video.
        print("failed: %s: %r" % (url, e))

ic3z

2017-11-07 11:01:55 +08:00

为什么讨论爬福利站的帖子的帖子也能引起巨大关注量

sola97

2017-11-07 12:27:51 +08:00

@holajamc #64 阿里要是能鉴别是欧美的还是亚洲的就好了哈哈

holajamc

2017-11-07 14:22:41 +08:00

@sola97 我觉得根据音频嗯说不定可以（滑稽.png

LastingTime

2017-11-08 11:50:49 +08:00

@holajamc six six six

holajamc

2017-11-08 12:29:51 +08:00

@LastingTime 设想是这样的：根据音频识别文字，然后用 TextRank 算出关键文本，最后用 n-gram 推测语言 2333

wangfei324017

2017-12-05 10:27:22 +08:00

@tdstevelx333 #49 https://github.com/dixudx/tumblr-crawler

5ipapa

2022-12-21 13:05:17 +08:00

嘟嘟嘟发车了
t.9217lu.com/t8k2c8
收藏不迷路