Web crawler: scraping book info and ISBN codes from allitebooks.com, saving book names and ISBNs into a .csv (from backslash112)
Posted by XinZhou_Annie
The script (Python 2; it uses urllib2, BeautifulSoup and the csv module):

from urllib2 import urlopen
from bs4 import BeautifulSoup
import csv


# Get the next page url from the current page url
def get_next_page_url(url):
    page = urlopen(url)
    soup_page = BeautifulSoup(page, 'lxml')
    page.close()
    # Get the current page tag and the next page tag
    current_page_tag = soup_page.find(class_="current")
    next_page_tag = current_page_tag.find_next_sibling()
    # Check if the current page is the last one
    if next_page_tag is None:
        next_page_url = None
    else:
        next_page_url = next_page_tag['href']
    return next_page_url


# Get the book detail urls from a listing page url
def get_book_detail_urls(url):
    page = urlopen(url)
    soup = BeautifulSoup(page, 'lxml')
    page.close()
    urls = []
    book_header_tags = soup.find_all(class_="entry-title")
    for book_header_tag in book_header_tags:
        urls.append(book_header_tag.a['href'])
    return urls


# Get the book title and ISBN from a book detail url
def get_book_isbn(url):
    page = urlopen(url)
    book_isbn_soup = BeautifulSoup(page, 'lxml')
    page.close()
    title_tag = book_isbn_soup.find(class_="single-title")
    title = title_tag.string
    isbn_key_tag = book_isbn_soup.find(text="ISBN-10:").parent
    isbn_tag = isbn_key_tag.find_next_sibling()
    isbn = isbn_tag.string.strip()  # Remove the surrounding whitespace with strip()
    return [title, isbn]
    # return ['title:', title, 'isbn:', isbn]


def save_to_csv(book_info_list):
    path = '/Users/zhouxin/Desktop/books.csv'
    with open(path, 'w') as fp:
        a = csv.writer(fp, delimiter=',')
        a.writerow(['title', 'isbn'])
        a.writerows(book_info_list)


def start():
    url = "http://www.allitebooks.com/marketing-seo/"
    book_info_list = []

    def next_page(page_url):
        book_detail_urls = get_book_detail_urls(page_url)
        for book_detail_url in book_detail_urls:
            # print(book_detail_url)  # print each book detail url one by one
            book_info = get_book_isbn(book_detail_url)
            # print(book_info)
            print ",".join(book_info)  # print title and ISBN
            book_info_list.append(book_info)
        next_page_url = get_next_page_url(page_url)
        if next_page_url is not None:
            next_page(next_page_url)
        else:
            return

    next_page(url)
    save_to_csv(book_info_list)


start()  # kick off the crawl and write the results to the csv file
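For reference, here is a minimal sketch of the same approach under Python 3, since urllib2 and the print statement only exist in Python 2. This is an assumption-laden port rather than the original author's code: it keeps the CSS class names ("current", "entry-title", "single-title") and the start URL from the script above, assumes the allitebooks.com markup is unchanged, and writes to a relative books.csv path instead of the author's Desktop path.

# Minimal Python 3 sketch of the same crawler (assumes the original
# class names -- "current", "entry-title", "single-title" -- still match
# the allitebooks.com markup; the output path is a placeholder).
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv


def get_soup(url):
    # Fetch a page and parse it with BeautifulSoup.
    with urlopen(url) as page:
        return BeautifulSoup(page, 'lxml')


def get_next_page_url(url):
    # The pagination widget marks the current page with class "current";
    # its next sibling (if any) links to the next page.
    next_page_tag = get_soup(url).find(class_="current").find_next_sibling()
    return None if next_page_tag is None else next_page_tag['href']


def get_book_detail_urls(url):
    # Each book on a listing page is wrapped in an "entry-title" header.
    return [tag.a['href'] for tag in get_soup(url).find_all(class_="entry-title")]


def get_book_isbn(url):
    # On a detail page the title sits in "single-title"; the ISBN-10 value
    # is the next sibling of the element holding the "ISBN-10:" label.
    soup = get_soup(url)
    title = soup.find(class_="single-title").string
    isbn = soup.find(string="ISBN-10:").parent.find_next_sibling().string.strip()
    return [title, isbn]


def save_to_csv(book_info_list, path='books.csv'):
    # newline='' avoids blank rows on Windows when using the csv module.
    with open(path, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['title', 'isbn'])
        writer.writerows(book_info_list)


def start(url="http://www.allitebooks.com/marketing-seo/"):
    book_info_list = []
    while url is not None:  # iterate over listing pages instead of recursing
        for detail_url in get_book_detail_urls(url):
            book_info = get_book_isbn(detail_url)
            print(",".join(book_info))
            book_info_list.append(book_info)
        url = get_next_page_url(url)
    save_to_csv(book_info_list)


if __name__ == '__main__':
    start()

The nested recursive next_page function is replaced here with a plain while loop, which behaves the same but avoids Python's recursion limit on categories with many pages.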
Output: