import csv
import requests
from bs4 import BeautifulSoup
def parse_text(stags, soup):
    """Collect the ``.text`` of every element returned by ``soup.find_all``.

    Args:
        stags: Tag selector handed unchanged to ``soup.find_all`` (the
            script below passes a tag name such as ``"p"``).
        soup: Object exposing ``find_all``; presumably a ``BeautifulSoup``
            document, as built by the script section of this file.

    Returns:
        A list with the ``.text`` value of each matched element, in the
        order ``find_all`` produced them.

    Collection is best-effort: if reading ``.text`` from an element raises,
    a ``WARNING: ...`` line is printed and that element is skipped.
    """
    found = []
    for item in soup.find_all(stags):
        try:
            found.append(item.text)
        except Exception as e:
            # ``except ... as`` and single-argument ``print(...)`` behave the
            # same on Python 2.6+ and Python 3, unlike the py2-only forms.
            print("WARNING: %s" % str(e))
    return found
def parse_att(stags, sattribute, soup):
    """Collect one attribute from every element returned by ``soup.find_all``.

    Args:
        stags: Tag selector handed unchanged to ``soup.find_all`` (the
            script below passes a tag name such as ``"img"``).
        sattribute: Attribute name looked up on each matched element via
            ``item.get(sattribute)`` (e.g. ``"src"``).
        soup: Object exposing ``find_all``; presumably a ``BeautifulSoup``
            document, as built by the script section of this file.

    Returns:
        A list holding the result of ``item.get(sattribute)`` for each
        matched element, in the order ``find_all`` produced them. Whatever
        ``get`` returns is appended as-is (presumably ``None`` when the
        attribute is absent -- confirm against the bs4 ``Tag.get`` docs),
        so the list stays aligned with the matched elements.

    Collection is best-effort: if the lookup on an element raises, a
    ``WARNING: ...`` line is printed and that element is skipped.
    """
    found = []
    for item in soup.find_all(stags):
        try:
            found.append(item.get(sattribute))
        except Exception as e:
            # ``except ... as`` and single-argument ``print(...)`` behave the
            # same on Python 2.6+ and Python 3, unlike the py2-only forms.
            print("WARNING: %s" % str(e))
    return found
if __name__ == "__main__":
    # Demo run: fetch one page, then dump its paragraph texts and image sources.
    url = "http://www.nytimes.com/"
    print(url)
    # An explicit timeout keeps the script from blocking forever on a stalled
    # connection; requests applies no timeout unless one is given.
    r = requests.get(url, timeout=30)
    # Build the parsed document using the lxml parser.
    soup = BeautifulSoup(r.content, "lxml")
    # Text of every "p" tag.
    print(parse_text("p", soup))
    # "src" attribute of every "img" tag.
    print(parse_att("img", "src", soup))