爬取futurelearn课程网
Posted by yangbiao6
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取futurelearn课程网相关的知识,希望对你有一定的参考价值。
"""Scrape the FutureLearn science/engineering/maths course catalogue.

The script downloads the category listing page, collects the URL of every
course linked from it, then visits each course page and extracts its title,
key-info fields and descriptive paragraphs.
"""
import csv
import re
from urllib.parse import urljoin

import bs4
import requests
from bs4 import BeautifulSoup

# Root used to turn the relative hrefs found on the listing page into
# absolute URLs.
BASE_URL = "https://www.futurelearn.com"


def gethtmlText(url, code='utf-8'):
    """Download *url* and return the decoded response body.

    Args:
        url: Address of the page to fetch.
        code: Fallback encoding, used only when the encoding cannot be
            detected from the response content.

    Returns:
        The page text, or ``None`` when the request fails (connection error,
        timeout, or a non-2xx status). A failure is reported on stdout.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        print("获取失败")
        return None
    # apparent_encoding may be None (e.g. empty body); fall back to `code`.
    r.encoding = r.apparent_encoding or code
    return r.text


# Alias for the spelling used by the original ``main``.
getHTMLText = gethtmlText


# Collect the basic course list.
def getCourseList(lst, html):
    """Append the absolute URL of every course linked from *html* to *lst*.

    Only anchors inside the compact card grid are considered, and only hrefs
    containing ``courses`` are kept. URLs already present in *lst* are not
    added again, so a course linked several times from one card is fetched
    only once.

    Args:
        lst: List that receives the course URLs (mutated in place).
        html: Markup of the category listing page; ``None`` or an empty
            string is tolerated and leaves *lst* untouched.
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'html.parser')
    grids = soup.find_all(
        'div', class_='m-grid-of-cards m-grid-of-cards--compact')
    seen = set(lst)
    for grid in grids:
        # Read the href attribute directly instead of regex-matching the
        # serialized HTML: that captured trailing attributes such as
        # `" role="button` which then had to be stripped by hand.
        for anchor in grid.find_all('a', href=True):
            href = anchor['href']
            if 'courses' not in href:
                continue
            course_url = urljoin(BASE_URL, href)
            if course_url not in seen:
                seen.add(course_url)
                lst.append(course_url)


def _paragraph_texts(container):
    """Return the text of every ``<p>`` inside *container* ([] if missing)."""
    if container is None:
        return []
    return [p.text for p in container.find_all('p')]


def CourseList(lst):
    """Fetch every course page in *lst* and extract its descriptive fields.

    For each page the row is built, in order, from: the billboard heading(s),
    every key-info value, the paragraphs of the tight content block, and the
    paragraphs of the alt-adjacent section. Pages that cannot be downloaded
    are skipped; page sections that are absent contribute nothing.

    Args:
        lst: Iterable of course page URLs.

    Returns:
        A list with one row (list of strings) per successfully fetched
        course. Each row is also printed as it is produced.
    """
    rows = []
    for url in lst:
        html = gethtmlText(url)
        if not html:
            continue
        page = BeautifulSoup(html, 'html.parser')

        row = [h.text for h in
               page.find_all('h1', class_='m-dual-billboard__heading')]
        row.extend(s.text for s in
                   page.find_all('span', class_='m-key-info__content'))
        # find() returns None when the section is absent, so go through a
        # helper that tolerates it instead of chaining .find_all directly.
        row.extend(_paragraph_texts(
            page.find('div', class_="a-content a-content--tight")))
        row.extend(_paragraph_texts(
            page.find('section', class_='a-section a-section--alt-adjacent')))

        print(row)
        rows.append(row)
    return rows


# Disabled CSV export, kept for its intended column layout. It depends on
# pandas, which is not imported by this script.
# def write_dictionary_to_csv(list_1, filename):
#     file_name = '{}.csv'.format(filename)
#     name = ['课程名', '课时', '学习任务', '课程性质', '额外费用',
#             '介绍', '话题', '开始时间', '服务对象', '']
#     test = pd.DataFrame(columns=name, data=list_1)
#     test.to_csv(file_name)


def main():
    """Scrape the science, engineering and maths category end to end."""
    start_url = ("https://www.futurelearn.com/courses/categories/"
                 "science-engineering-and-maths-courses?all_courses=1")
    info_list = []
    html = gethtmlText(start_url)
    getCourseList(info_list, html)

    CourseList(info_list)
    # write_dictionary_to_csv(info_list, 'courses')


if __name__ == "__main__":
    main()
以上是关于爬取futurelearn课程网的主要内容,如果未能解决你的问题,请参考以下文章
使用selenium + chrome爬取中国大学Mooc网的计算机学科的所有课程链接