__author__ = 'sean'
from bs4 import BeautifulSoup
import os
import cPickle as pickle
# Build a list of abstract texts from the saved PubMed HTML pages found in
# `path`, then pickle that list to 'pubmed_abstract.pkl' in the current
# working directory.
path = '/Users/sean/ml/dataset/pubmed-bioinfo-abstracts/paperAbstracts/'
filenames = os.listdir(path)
txt_corpus = []
for thefile in filenames:
    print(thefile)
    # macOS drops a .DS_Store metadata file into directories; it is not an
    # abstract page, so skip it.
    if thefile == ".DS_Store":
        continue
    # Read the raw bytes and close the file before parsing so the handle is
    # not held open while BeautifulSoup works.
    with open(os.path.join(path, thefile), "rb") as f:
        strings = f.read()
    # NOTE(review): no parser is named here, so bs4 picks whichever parser is
    # installed; results may differ between machines -- consider pinning one.
    soup = BeautifulSoup(strings)
    for hit in soup.findAll(attrs={'class': 'abstract_text'}):
        children = hit.contents
        # The abstract is taken from the second child node. A block with
        # fewer than two children would raise IndexError and abort the whole
        # run before anything is saved, so skip it instead.
        if len(children) < 2:
            print('skipping abstract_text element without a second child in ' + thefile)
            continue
        txt_corpus.append(children[1].text)
print('done')
with open('pubmed_abstract.pkl', 'wb') as dicpkl:
    pickle.dump(txt_corpus, dicpkl)
print('pickle saved')