python Facebook帖子爬虫
Posted
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了 Python Facebook 帖子爬虫相关的知识，希望对你有一定的参考价值。
from bs4 import BeautifulSoup
from selenium import webdriver
import garble
import json, time, threading
class FacebookCrawler:
    """Crawl a Facebook page and append its posts to a JSON file.

    Each crawler drives its own PhantomJS browser, scrolls the page at
    ``url`` to load more posts, and writes every newly seen post to
    ``<output_dir>/<name>.json``.

    Args:
        name: Base name (without extension) of the output file.
        url: Address of the page to crawl.
        options: Optional mapping of settings. Recognised keys are
            ``interval`` (seconds to sleep between scroll attempts,
            default ``5``) and ``output`` (directory for the output file,
            default ``'/tmp/'``).
    """

    def __init__(self, name, url, options=None):
        # Default to None rather than a shared mutable ``{}``; this also lets
        # callers pass ``None`` explicitly.
        options = options or {}
        self.name = name
        self.url = url
        self.interval = options.get('interval', 5)
        self.output_dir = options.get('output', '/tmp/')

    def run(self):
        """Start ``worker`` on a new thread and return that thread.

        Returns:
            The started ``threading.Thread``, so callers may ``join()`` it.
        """
        thread = threading.Thread(target=self.worker)
        thread.start()
        return thread

    def worker(self):
        """Run one crawl: open a browser, pull posts, append them to disk.

        The browser is always shut down, even when opening the output file
        or the crawl itself raises.
        """
        import os  # local import: keeps this block independent of file imports

        browser = webdriver.PhantomJS(service_args=[
            '--load-images=false',
            '--ignore-ssl-errors=true',
            '--web-security=false'])
        try:
            browser.set_window_size(1920, 1080)
            # os.path.join tolerates an output directory given with or
            # without a trailing separator.
            path = os.path.join(self.output_dir, self.name + '.json')
            # Binary append mode: the encoded bytes can be written as-is on
            # both Python 2 and Python 3 (a text-mode file rejects bytes on
            # Python 3).
            with open(path, 'ab') as output:
                for posts in self.get_post_list(browser, self.url):
                    for post in posts:
                        # Posts are written back to back as indented JSON
                        # objects, with no separator between them.
                        output.write(str(post).encode('utf-8'))
        finally:
            browser.quit()

    def get_post_list(self, browser, url):
        """Yield batches of new posts found at ``url``.

        This is a generator. On each round it parses the current page,
        yields the posts that were not present on the previous round, trims
        old posts out of the live page, scrolls down and then sleeps for
        ``self.interval`` seconds.

        Args:
            browser: WebDriver instance used to load and scroll the page.
            url: Address of the page to load.

        Yields:
            A list of ``FacebookCrawler.Post`` objects (possibly empty).
        """
        browser.get(url)
        max_old_posts_num = 10   # already-seen posts to keep in the DOM
        max_failed_scrolls = 3   # consecutive failed scrolls tolerated
        old_posts_num = 0
        failed_scrolls = 0
        while True:
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            posts = soup.select('.userContentWrapper._5pcr')
            # Build the batch eagerly so it reflects the page as parsed now,
            # independent of when the consumer iterates it.
            yield [self.Post(item) for item in posts[old_posts_num:]]
            old_posts_num = len(posts)
            if old_posts_num > max_old_posts_num:
                # Drop the oldest posts from the page so it does not grow
                # without bound while scrolling.
                self.delete_posts(browser, old_posts_num - max_old_posts_num)
                old_posts_num = max_old_posts_num
            if self.scroll_down(browser):
                failed_scrolls = 0
            else:
                failed_scrolls += 1
            # Stop once scrolling failed more than max_failed_scrolls times
            # in a row.
            if failed_scrolls > max_failed_scrolls:
                break
            time.sleep(self.interval)

    class Post:
        """A single post extracted from a parsed post element.

        Attributes set by ``__init__``: ``timestamp``, ``id``, ``link`` and
        ``text``.
        """

        def __init__(self, post_css):
            """Read the post fields out of ``post_css``.

            Args:
                post_css: Parsed element for one post (one of the elements
                    selected by ``get_post_list``).
            """
            tag = post_css.find('abbr', {'class': '_5ptz'})
            # Keep only what follows the first ', ' in the title attribute.
            self.timestamp = tag['title'].partition(', ')[2]
            self.id = tag['data-utime']
            tag = post_css.find('a', {'class': '_5pcq'}, href=True)
            self.link = 'www.facebook.com' + tag['href']
            self.text = ''.join([p.get_text() for p in post_css.find_all('p')])
            self.text = garble.normalize(self.text)

        def __str__(self):
            """Return the post's attributes as indented, key-sorted JSON."""
            return json.dumps(self, default=lambda o: o.__dict__,
                              sort_keys=True, indent=4)

    @staticmethod
    def delete_posts(browser, num):
        """Remove the first ``num`` posts from the live page.

        Args:
            browser: WebDriver instance showing the page.
            num: Number of leading post elements to remove.
        """
        # Indexed loop instead of ``for..in``: ``for..in`` over a NodeList
        # also enumerates non-index properties such as ``length``. The count
        # is passed as a script argument rather than formatted into the
        # source text.
        script = """
            var list = document.querySelectorAll(".userContentWrapper._5pcr");
            var limit = Math.min(arguments[0], list.length);
            for (var i = 0; i < limit; i++) {
                list[i].parentElement.parentElement.parentElement.parentElement.remove();
            }
        """
        browser.execute_script(script, num)

    @staticmethod
    def scroll_down(browser):
        """Scroll to the bottom of the page and report whether it grew.

        Args:
            browser: WebDriver instance showing the page.

        Returns:
            ``True`` if ``document.body.scrollHeight`` changed across the
            scroll, ``False`` otherwise.
        """
        # NOTE(review): the height is re-read immediately after scrolling, so
        # content that loads asynchronously may not be counted yet - confirm
        # whether a short wait is needed here.
        height_before_scroll = browser.execute_script('return document.body.scrollHeight;')
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')  # scroll to bottom
        height_after_scroll = browser.execute_script('return document.body.scrollHeight;')
        return height_after_scroll != height_before_scroll
if __name__ == '__main__':
    # Pages to crawl, as (output name, page URL) pairs; each one is handed
    # to its own crawler and started in turn.
    targets = (
        ('uthbadar', 'https://www.facebook.com/uthbadar/'),
        ('senatorabetz', 'https://www.facebook.com/senatorabetz/'),
    )
    for page_name, page_url in targets:
        FacebookCrawler(page_name, page_url).run()
以上是关于 Python Facebook 帖子爬虫的主要内容，如果未能解决你的问题，请参考以下文章：
python 关于LHL Facebook帖子评论的情绪分析,请访问https://www.facebook.com/leehsienloong/posts/1505690826160285