python Wikipedia_Scrape_Lists
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python WIkipedia_Scrape_Lists相关的知识,希望对你有一定的参考价值。
import wikipedia
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
# First pull the HTML from the index page that links to all of the list pages.
# In this case, this page links to the list pages of sci-fi films by decade.
# Go to https://en.wikipedia.org/wiki/Lists_of_science_fiction_films
# to see what is being pulled from.
response = requests.get('https://en.wikipedia.org/wiki/Lists_of_science_fiction_films')
# fail fast on HTTP errors (404/500) instead of silently parsing an error page
response.raise_for_status()
# turn the HTML into a BeautifulSoup tree
b = BeautifulSoup(response.text, 'lxml')
# collect every href found inside an <li> element
links = []
for list_item in b.find_all(name='li'):
    # pull the actual link(s) inside each list item
    for anchor in list_item.find_all('a', href=True):
        links.append(anchor['href'])
# the page contains more links than we want; positions 1-10 are the
# per-decade list pages (verified by inspecting the page layout)
links = links[1:11]
# each href is site-relative (e.g. '/wiki/List_of_science_fiction_films_of_the_1920s'),
# so prepend the domain to build absolute URLs
decade_links = ['https://en.wikipedia.org' + href for href in links]
# Parallel lists: one for the title of each film page, one for its link.
film_titles = []
film_links = []
# Pull from each decade page's list of films.
# Look at https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s
# to follow along as an example.
for decade in decade_links:
    print(f'Collecting films from {decade}')
    page = requests.get(decade)
    # fail fast on HTTP errors rather than scraping an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    # each decade page keeps its films in one or more 'wikitable' tables
    for table in soup.find_all(name='table', class_='wikitable'):
        # one <tr> per film
        for row in table.find_all(name='tr'):
            # film titles are italicized, so <i> isolates the title cell
            for title_cell in row.find_all(name='i'):
                # require BOTH attributes: href=True alone would admit anchors
                # without a 'title' attribute and raise KeyError below
                for anchor in title_cell.find_all('a', href=True, title=True):
                    # record the display title and the page link in lockstep
                    film_titles.append(anchor['title'])
                    film_links.append(anchor['href'])
    # be a conscientious scraper and pause between requests
    time.sleep(1)
print(f'Number of Film Links Collected: {len(film_links)}')
print(f'Number of Film Titles Collected: {len(film_titles)}')
# Remove films that don't have a description page on Wikipedia: a missing
# page shows up as 'redlink' in the href and '(page does not exist)' in the
# title. Filter titles and links as PAIRS so the two lists stay aligned —
# filtering each list independently (the original approach) could silently
# pair the wrong title with the wrong link if the markers ever disagreed.
title_links = [
    (title, link)
    for title, link in zip(film_titles, film_links)
    if 'redlink' not in link and '(page does not exist)' not in title
]
# keep the two flat lists for callers that use them separately
new_film_titles = [title for title, _ in title_links]
new_film_links = [link for _, link in title_links]
print(f'Number of Film Links with Wikipedia Pages: {len(new_film_links)}')
print(f'Number of Film Titles with Wikipedia Pages: {len(new_film_titles)}')
# use title_links (list of (title, link) tuples) to fetch from the API
以上是关于python Wikipedia_Scrape_Lists的主要内容,如果未能解决你的问题,请参考以下文章