Last active
October 19, 2017 04:53
-
-
Save mopemope/5464814 to your computer and use it in GitHub Desktop.
2ch crawler prototype
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import requests | |
from pyquery import PyQuery as pq | |
import parser | |
import re | |
import datastore | |
# Matches the numeric thread id embedded in a 2ch thread URL,
# e.g. ".../read.cgi/board/1234567890/l50" -> "1234567890".
# Raw string so "\d" is a regex escape, not a (deprecated) string escape.
url_re = re.compile(r".*/(\d+)/.*", re.M)

# Entry point listing every 2ch board.
MENU_URL = "http://menu.2ch.net/bbsmenu.html"
def get_board_list():
    """Fetch the 2ch board menu and return ``{board_name: board_url}``.

    Returns None when the menu page cannot be retrieved (non-200).
    """
    res = requests.get(MENU_URL)
    if res.status_code == 200:
        # 2ch serves Shift_JIS (cp932); drop undecodable bytes.
        text = res.content.decode("cp932", "ignore")
        return parser.parse_menu(text)
def get_thread_list(board_nm, board_url):
    """Fetch a board's subback.html index and return its thread list.

    Delegates parsing to parser.parse_thread_list; returns None on a
    non-200 response.
    """
    res = requests.get(board_url + "subback.html")
    if res.status_code == 200:
        text = res.content.decode("cp932", "ignore")
        return parser.parse_thread_list(board_url, text)
def get_thread_data(board_url, url, nm):
    """Download one thread's raw dat via the bg20 read API and parse it.

    Returns the parsed post list, or None when the thread id cannot be
    extracted from *url*, the request fails, or bg20 reports its
    "ERROR = 5656" (thread unavailable) marker.
    """
    m = url_re.search(url)
    if not m:
        return None
    # bg20 mirrors raw dat files: .../r.so/<host+board>/<thread-id>/
    bg20 = 'http://bg20.2ch.net/test/r.so/'
    bgurl = "%s%s%s/" % (bg20, board_url[7:], m.group(1))  # [7:] strips "http://"
    res = requests.get(bgurl)
    if res.status_code != 200:
        return None
    text = res.content.decode("cp932", "ignore")
    if text.find("ERROR = 5656") != -1:
        return None
    return parser.parse_thread(board_url, url, nm, text)
import concurrent.futures | |
def crawle_thread(board_url, name, url, rescount):
    """Crawl a single thread and store only its new posts.

    Fetches the thread dat, upserts the thread's metadata, and inserts
    the posts beyond the previously stored count. Errors are logged and
    swallowed so one bad thread does not abort the whole crawl.
    """
    try:
        dats = get_thread_data(board_url, url, name)
        if dats:
            old_count = datastore.insert_thread(board_url, name, url, rescount)
            if old_count >= 0:
                # Skip posts already stored on a previous pass.
                datastore.insert_dat(dats[old_count:])
            else:
                # NOTE(review): message text is mojibake in the original
                # (presumably "no new posts") -- left byte-identical.
                print("譁ー逹縺ェ縺 %s" % name)
        else:
            print("bg20 is dead. %s" % name)
    except Exception:
        # Was a bare except: narrowed so KeyboardInterrupt/SystemExit
        # can still stop the crawler.
        import traceback
        print(board_url)
        print(traceback.format_exc())
def exec_crawle(tls):
    """Crawl every thread tuple in *tls* concurrently (8 workers)."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
        futures = []
        for board_url, name, url, rescount in tls:
            futures.append(pool.submit(crawle_thread, board_url, name, url, rescount))
        for fut in concurrent.futures.wait(futures).done:
            fut.result()  # re-raise anything a worker raised
def crawle(board_nm, board_url):
    """Crawl one board: list its threads, then crawl each of them."""
    print(board_url)
    threads = get_thread_list(board_nm, board_url)
    if threads:
        exec_crawle(threads)
def run():
    """Run one full crawl pass over every known board (2 at a time)."""
    datastore.insert_boards(get_board_list())
    boards = datastore.get_boards()
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as pool:
        pending = [pool.submit(crawle, b["name"], b["url"]) for b in boards]
        for fut in concurrent.futures.wait(pending).done:
            fut.result()  # surface any worker exception
if __name__ == "__main__":
    # Run forever, re-crawling every 15 minutes. The guard fixes the
    # original, which started this infinite loop at module level and so
    # hung any process that merely imported the module.
    import time

    while True:
        run()
        time.sleep(60 * 15)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from pymongo import Connection, ASCENDING | |
import hashlib | |
# Database / collection names used throughout this module.
MAIN = 'main'  # database holding board and thread-list metadata
DAT = 'dat'  # database holding the raw post documents
BOARD_LIST = 'board_list'  # collection: {name, url} per board
THREAD_LIST = 'thread_list'  # collection: per-thread metadata incl. rescount
THREAD = 'thread'  # collection: one document per post (dat line)
def get_connection(host='localhost', port=27017):
    """Open a new MongoDB connection (this module opens one per operation)."""
    # NOTE(review): pymongo.Connection is the legacy pre-3.0 API (removed
    # in pymongo 3); migrating means MongoClient plus the insert_one /
    # update_one methods throughout -- confirm target pymongo version.
    return Connection(host, port)
def get_board_collection(conn=None):
    """Return the board_list collection, opening a connection if needed."""
    if not conn:
        conn = get_connection()
    return conn[MAIN][BOARD_LIST]
def get_boards():
    """Return all stored boards as a list of {name, url} documents.

    Bug fix: the original closed the connection and *then* built a lazy
    cursor from it, so iteration happened against a closed connection.
    Materialize the results before closing.
    """
    conn = get_connection()
    try:
        return list(conn[MAIN][BOARD_LIST].find())
    finally:
        conn.close()
def get_thread_list_collection(conn=None):
    """Return the thread_list collection, opening a connection if needed."""
    if not conn:
        conn = get_connection()
    return conn[MAIN][THREAD_LIST]
def get_thread_collection(conn=None):
    """Return the dat/thread (post) collection, opening a connection if needed."""
    if not conn:
        conn = get_connection()
    return conn[DAT][THREAD]
def insert_boards(bl):
    """Insert every {name: url} pair from *bl* into board_list.

    Tolerates a None/empty *bl* (get_board_list returns None on a failed
    fetch), and closes the connection in a finally block so an insert
    error no longer leaks it.
    """
    if not bl:
        return
    c = get_connection()
    try:
        collection = get_board_collection(c)
        for nm, url in bl.items():
            collection.insert(dict(url=url, name=nm))
    finally:
        c.close()
def insert_board(nm, url):
    """Insert a single board record; the connection is closed even on error."""
    c = get_connection()
    try:
        get_board_collection(c).insert(dict(url=url, name=nm))
    finally:
        c.close()
def insert_thread(board_url, name, url, rescount):
    """Upsert thread metadata and report how many posts were already stored.

    Returns the previously stored res count (0 for a brand-new thread)
    when *rescount* grew -- the caller skips that many posts -- or -1
    when there is nothing new.
    """
    query = {"url": url}
    c = get_connection()
    try:
        collection = get_thread_list_collection(c)
        r = collection.find_one(query)
        if not r:
            # Never seen this thread before.
            old_count = 0
        else:
            # Guard documents missing/None "rescount": on Python 3 the
            # original's `rescount > None` raised TypeError.
            old_count = r.get("rescount") or 0
        if rescount > old_count:
            print("%s incoming %s" % (name, (rescount - old_count)))
            d = dict(board_url=board_url, name=name, url=url, rescount=rescount)
            collection.update(query, {"$set": d}, upsert=True)
            return old_count
        return -1
    finally:
        # Also covers a failing find_one, which the original leaked on.
        c.close()
def insert_dat(dats):
    """Insert post documents one by one, logging each thread name.

    The connection is closed in a finally block so a failing insert no
    longer leaks it.
    """
    c = get_connection()
    try:
        collection = get_thread_collection(c)
        for dat in dats:
            collection.insert(dat)
            print("%s OK" % dat.get("thread_nm"))
    finally:
        c.close()
def setup():
    """Create the indexes this module relies on.

    The board_list unique url index is always (re)declared; the other
    collections only get indexes when they report none yet.
    """
    conn = get_connection()
    main_db = conn[MAIN]
    main_db[BOARD_LIST].create_index("url", unique=True)
    thread_list = main_db[THREAD_LIST]
    if not thread_list.index_information():
        for field in ("board_url", "url", "name"):
            thread_list.create_index(field)
    posts = conn[DAT][THREAD]
    if not posts.index_information():
        for field in ("board_url", "url", "no", "comment"):
            posts.create_index(field)
    conn.close()
def thread_nm_find(nm):
    """Find threads whose name matches the regex pattern *nm*.

    Bug fix: the original never closed the connection and returned a
    lazy cursor tied to it; materialize the results, then close.
    """
    import re
    c = get_connection()
    try:
        return list(c[MAIN][THREAD_LIST].find({"name": re.compile(nm)}))
    finally:
        c.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
import re | |
from pyquery import PyQuery as pq | |
from urllib.parse import urljoin | |
# Raw strings so backslash sequences like \d and \( are regex escapes,
# not (deprecated) string escapes.
url_re = re.compile(r"(.*read.cgi/\w+/\d+/).*", re.M)  # canonical thread URL prefix
res_re = re.compile(r".*\((\d+)\).*", re.M)            # "(123)" response count
title_re = re.compile(r"\d+:\s+(.*)\((\d+)\)", re.M)   # "12: title (123)" index line
def parse_menu(data):
    """Parse bbsmenu.html into ``{board_name: board_url}``.

    Skips anchors without an absolute http link, the menu page itself,
    and non-board links (php pages, bare site roots).
    """
    result = dict()
    q = pq(data)
    for anchor in q("a"):
        a = pq(anchor)
        href = a.attr.href
        # Anchors with no href attribute yield None here; the original
        # crashed on None.startswith.
        if not href or not href.startswith('http'):
            continue
        if href.endswith("bbsmenu"):
            continue
        if href.endswith("php") or href.endswith(".net/") or href.endswith(".jp/"):
            continue
        result[a.text()] = href
    return result
def parse_thread_list(board_url, data):
    """Parse subback.html into (board_url, title, thread_url, res_count) tuples.

    Returns None when the page has no <base> tag to resolve links against.
    """
    q = pq(data)
    base_url = None
    for base_tag in q("base"):
        base_url = pq(base_tag).attr.href
    if not base_url:
        return
    threads = []
    for anchor in q("a"):
        a = pq(anchor)
        full_url = urljoin(base_url, a.attr.href)
        # Thread links end in ".../l50"; anything else is navigation.
        if not full_url.endswith("50"):
            continue
        match = title_re.search(a.text())
        if match:
            title = match.group(1).strip()
            res_cnt = int(match.group(2))
            # Drop the trailing "l50" to get the canonical thread URL.
            threads.append((board_url, title, full_url[:-3], res_cnt))
    return threads
def parse_thread(board_url, url, thread_nm, data):
    """Parse a raw dat payload into a list of post dicts.

    Each dat line looks like "name<>mail<>date<>body<>extra"; lines with
    fewer than five "<>"-separated fields are skipped. Kept posts are
    numbered from 1 in order.
    """
    posts = []
    post_no = 1
    for line in data.split("\n"):
        fields = line.split("<>")
        if len(fields) <= 4:
            continue
        posts.append({
            "board_url": board_url,
            "url": url,
            "thread_nm": thread_nm,
            "hndl": fields[0],
            "mailto": fields[1],
            "date": fields[2],
            "comment": fields[3],
            "other": fields[4],
            "no": post_no,
        })
        post_no += 1
    return posts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment