import glob
import re

import nltk  # nltk.word_tokenize needs the 'punkt' tokenizer data: nltk.download('punkt')

def initial_clean(text):
    # drop URLs (http/https/www) and @mentions, then keep letters and spaces only
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(@)(\S+)?)", " ", text)
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower()  # lower-case the text
    return nltk.word_tokenize(text)  # split into a list of word tokens
for file in glob.glob('*.csv'):
    # a.csv => a-out.csv, b.csv => b-out.csv, etc.
    with open(file.replace('.csv', '-out.csv'), 'w') as outfile:
        # read a line at a time from the input file and
        # write the cleaned tokens, one line at a time, to the output file
        with open(file, errors='ignore') as infile:
            for line in infile:
                print(' '.join(initial_clean(line)), file=outfile)
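
# Quick sanity check of the cleaner (a minimal sketch: the sample sentence is
# made up, and it assumes the NLTK 'punkt' data has been downloaded):
# >>> initial_clean("Check https://example.com or mail me @user NOW!!!")
# ['check', 'or', 'mail', 'me', 'now']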