为什么爬一些简单的福利站总能引起巨大关注量?

2017-11-06 10:14:44 +08:00
 LastingTime
没有贬低的意思, 只是好奇从一些简单的公开的网站上爬下来些妹子图、福利图等
然后发在 V2EX 就这么受欢迎和关注..
尤其是程序员版块, 这不是稍微会点 HTTP 相关技能就能做到的事情吗?
11472 次点击
所在节点    程序员
73 条回复
ryd994
2017-11-07 02:08:28 +08:00
重点在福利而不在简单
你要有能力爬复杂的福利站,一样有流量
zjlin1984
2017-11-07 08:24:15 +08:00
@ryd994 这个见解比较好。
fish19901010
2017-11-07 09:25:26 +08:00
重点在福利不在简单+1,我相信就算人工保存下来过来发帖,一样能够有很高人气。。。关键在于分享的快乐,以及聚众看片的那种感觉。
holajamc
2017-11-07 09:34:51 +08:00
@sola97 嗯我也没有细细看过,不如抽出来关键帧去阿里鉴别一下?
yuxuan
2017-11-07 09:55:11 +08:00
最大的乐趣还是 “卧槽 我在办公室直接打开了 XXX 就在我旁边”😂
jijiwaiwai
2017-11-07 10:55:04 +08:00
#encoding=utf8
import re
import os
import urllib
import requests
import download_progress
from pyquery import PyQuery

# Shared HTTP session used for every page request in this script.
# NOTE: the name shadows the Python 2 stdlib module ``httplib``; the script
# does not import that module, and the functions in this file refer to the
# session by this name, so it is kept.
httplib = requests.Session()
# To route traffic through a local proxy, assign a mapping instead of None:
# httplib.proxies = {
#     "http": "127.0.0.1:1080",
#     "https": "127.0.0.1:1080",
# }
httplib.proxies = None

BASE_URL = "http://www.91porn.com"
# BASE_URL = "http://email.91dizhi.at.gmail.com.7h4.space"
# BASE_URL with the "http://" scheme prefix removed.
HOST = BASE_URL.replace("http://", "")

# Default headers sent with every page request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # NOTE(review): hard-coded session cookie committed with the source.
    # Consider loading it from the environment or a local config file.
    # The value is one header string, written as adjacent literals (one
    # cookie per line) that Python concatenates at compile time.
    "Cookie": (
        "__cfduid=db012482c270fa1f5ded2903a6e23bc7c1489314005; "
        "CLIPSHARE=mkfverli1mp659s49rklg7s4c6; "
        "watch_times=1; "
        "evercookie_cache=undefined; "
        "evercookie_etag=undefined; "
        "show_msg=3; "
        "91username=dfdsdfdgfgdfgf; "
        "DUID=51a2lwwqvsbLKLuC8GzQhovCxkJIoz3nOTQ3cXXjR4w31%2FFE; "
        "USERNAME=18f7vcdO6LmTdVWCgMCD1L0IVQqGLjL0BWT%2FxBYxbXDXzFxQRSnfKsKTVg; "
        "user_level=1; "
        "EMAILVERIFIED=no; "
        "level=1; "
        "__utma=69831812.144777271.1489314006.1489314006.1489322790.2; "
        "__utmb=69831812.0.10.1489322790; "
        "__utmc=69831812; "
        "__utmz=69831812.1489314006.1.1.utmcsr=91dizhi.space|utmccn=(referral)|utmcmd=referral|utmcct=/; "
        "AJSTAT_ok_pages=8; "
        "AJSTAT_ok_times=2"
    ),
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
}

def get_vids_v1():
    """Collect video page links from the "top" category listing.

    Fetches ``<BASE_URL>/v.php?category=top&viewtype=basic`` through the
    shared session and gathers the ``href`` of every anchor under
    ``div.listchannel`` whose value contains ``"viewkey"``.

    Returns:
        list: The matching hrefs without duplicates, in the order they first
        appear on the page.
    """
    url = "%s/v.php?category=top&viewtype=basic" % BASE_URL
    # Same timeout as the per-video requests, so a stalled listing request
    # cannot hang the script forever.
    page = httplib.get(url, headers=headers, timeout=10).content
    dom = PyQuery(page)

    urls = []
    seen = set()
    for anchor in dom("div.listchannel a").items():
        href = anchor.attr("href")
        # An anchor without an href yields None; skip it instead of letting
        # the substring test raise TypeError.
        if not href or "viewkey" not in href or href in seen:
            continue
        seen.add(href)
        urls.append(href)
    return urls

def get_vids():
    """Collect video page links from the featured tab of the index page.

    Fetches ``<BASE_URL>/index.php`` through the shared session and gathers
    the ``href`` of every anchor under ``div#tab-featured`` whose value
    contains ``"viewkey"``.

    Returns:
        list: The matching hrefs without duplicates, in the order they first
        appear on the page.
    """
    url = "%s/index.php" % BASE_URL
    # Same timeout as the per-video requests, so a stalled listing request
    # cannot hang the script forever.
    page = httplib.get(url, headers=headers, timeout=10).content
    dom = PyQuery(page)

    urls = []
    seen = set()
    for anchor in dom("div#tab-featured a").items():
        href = anchor.attr("href")
        # An anchor without an href yields None; skip it instead of letting
        # the substring test raise TypeError.
        if not href or "viewkey" not in href or href in seen:
            continue
        seen.add(href)
        urls.append(href)
    return urls

def get_video_url_v1(url):
    """Resolve a video page to its media URL through ``getfile.php``.

    Loads the page at ``url``, reads the ``file``, ``max_vid`` and
    ``seccode`` values from its ``so.addVariable(...)`` calls, requests
    ``<BASE_URL>/getfile.php`` with them, and URL-decodes the ``file=`` field
    of the response.

    Args:
        url: Address of the video page.

    Returns:
        tuple: ``(VID, video_url)`` where ``VID`` is the numeric ``file``
        value as a string and ``video_url`` is the decoded media address.

    Raises:
        IndexError: If the page has no matching ``<title>`` or lacks one of
            the expected ``so.addVariable(...)`` values.
    """
    def first_match(pattern, text):
        # First regex hit, or an IndexError that says what was missing.
        found = re.findall(pattern, text)
        if not found:
            raise IndexError("pattern %r not found in %s" % (pattern, url))
        return found[0]

    data = httplib.get(url, headers=headers, timeout=10).content
    # Remove newlines and spaces so the patterns below need not allow for
    # optional whitespace (e.g. "so.addVariable('seccode' , '...')").
    data = data.replace("\n", "").replace(" ", "")

    # Example of the page fragments being matched (before whitespace removal):
    #   <title> xxxx-Chinese homemade video</title>
    #   so.addVariable('file','201812');
    #   so.addVariable('max_vid','202377');
    #   so.addVariable('seccode' , '842fa039535238905a93ccb961e21183');
    # The title pattern has two groups, so ``title`` is a 2-tuple; it is only
    # used for the progress line printed below.
    title = first_match(r"<title>(.+?)-(.+?)</title>", data)
    VID = first_match(r"so.addVariable\('file','(\d+)'\);", data)
    max_vid = first_match(r"so.addVariable\('max_vid','(\d+)'\);", data)
    seccode = first_match(r"so.addVariable\('seccode','(.+?)'\);", data)

    getfile_url = "%s/getfile.php?VID=%s&mp4=0&seccode=%s&max_vid=%s" % (
        BASE_URL, VID, seccode, max_vid)

    # Work on a copy so the extra headers do not leak into later requests
    # through the shared module-level dict.
    getfile_headers = dict(headers)
    getfile_headers.update({
        "Referer": url,
        "X-Requested-With": "ShockwaveFlash/23.0.0.207",
        "Accept": "*/*",
        # The Host header must name the server; "*/*" is not a valid host.
        "Host": HOST,
        "Connection": "keep-alive",
    })

    # Example response body:
    #   file=http%3A%2F%2F192.240.120.2%2Fmp43%2F202132.mp4%3Fst%3D...%26e%3D1489306723&domainUrl=http://91porn.ro.lt&imgUrl=http://img.file.am/91porn/>
    data = httplib.get(getfile_url, headers=getfile_headers, timeout=10).content
    video_url = data.split("&domainUrl=")[0].split("file=")[-1]
    video_url = urllib.unquote(video_url)

    print("%s %s %s" % (VID, title, video_url))
    return VID, video_url

def get_video_url(url):
    """Fetch a video page and extract its title and media source address.

    Args:
        url: Address of the video page.

    Returns:
        tuple: ``(title, video_url)`` - the raw contents of the
        ``viewvideo-title`` div and the ``src`` of the first ``<source>`` tag.

    Raises:
        IndexError: If the page has no ``<source src="...">`` tag or no
            ``viewvideo-title`` div.
    """
    page = httplib.get(url, headers=headers, timeout=10).content
    # "." does not match a newline by default, so drop the line breaks to let
    # the patterns below match values that span several lines.
    for line_break in ("\n", "\r"):
        page = page.replace(line_break, "")

    source_hits = re.findall('<source src="(.+?)"', page)
    video_url = source_hits[0]
    title_hits = re.findall('<div id="viewvideo-title">(.+?)</div>', page)
    title = title_hits[0]

    print(title.decode("utf8"))
    print(video_url)
    return title, video_url


def download_video(video_url, file_name):
    """Download ``video_url`` into ``file_name`` unless the file already exists.

    Args:
        video_url: Address of the media file to fetch.
        file_name: Destination path. Its parent directory is created when
            missing.

    Returns:
        ``True`` when ``file_name`` already exists (nothing is downloaded);
        ``None`` after a fresh download.
    """
    if os.path.exists(file_name):
        return True

    # Create the destination directory on demand so the first run does not
    # fail just because it is missing.
    target_dir = os.path.dirname(file_name)
    if target_dir and not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    conn = urllib.urlopen(video_url)
    try:
        data = download_progress.chunk_read(
            conn, report_hook=download_progress.chunk_report)
    finally:
        # Release the connection even when the transfer fails midway.
        conn.close()

    # Write under a temporary name and rename afterwards: an interrupted
    # write must not leave a truncated file that the existence check above
    # would treat as a finished download on the next run.
    partial_name = file_name + ".part"
    with open(partial_name, "wb") as f:
        f.write(data)
    os.rename(partial_name, file_name)



# Script body: fetch the featured video links and download each one into
# the downloads/ directory, named after the video title.
urls = get_vids()
for url in urls:
    print("=" * 60)
    print(url)
    try:
        title, video_url = get_video_url(url)
        # Turn the title into a file name: separators and other awkward
        # characters become "-", brackets and quotes are dropped.
        for char in ("/", "?", ":", "\\", "&", " "):
            title = title.replace(char, "-")
        for char in ("(", ")", "\"", "'"):
            title = title.replace(char, "")

        file_name = "downloads/%s.mp4" % (title)
        download_video(video_url, file_name)

        # Append the same marker line that `echo 11111 >> file` wrote, but
        # without a shell: the file name comes from the page title, so
        # interpolating it into a shell command allows command injection.
        # NOTE(review): the purpose of the marker is not evident from this
        # file (presumably to alter the file's checksum) - confirm.
        with open(file_name, "ab") as f:
            f.write(b"11111\n")

    except Exception as e:
        # Best effort: report the failure and continue with the next video
        # instead of hiding it.
        print("failed to download %s: %r" % (url, e))
ic3z
2017-11-07 11:01:55 +08:00
为什么讨论爬福利站的帖子的帖子也能引起巨大关注量
sola97
2017-11-07 12:27:51 +08:00
@holajamc #64 阿里要是能鉴别是欧美的还是亚洲的就好了哈哈
holajamc
2017-11-07 14:22:41 +08:00
@sola97 我觉得根据音频嗯说不定可以(滑稽.png
LastingTime
2017-11-08 11:50:49 +08:00
@holajamc six six six
holajamc
2017-11-08 12:29:51 +08:00
@LastingTime 设想是这样的:根据音频识别文字,然后用 TextRank 算出来关键文本,最后用 n-gram 推测语言 2333
wangfei324017
2017-12-05 10:27:22 +08:00
5ipapa
2022-12-21 13:05:17 +08:00
嘟嘟嘟 发车了
t.9217lu.com/t8k2c8
收藏不迷路

这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。

https://www.v2ex.com/t/403833

V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。

V2EX is a community of developers, designers and creative people.

© 2021 V2EX