Python批量下载MOOC课件

Posted 2021-09-04 Gendan

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了Python批量下载MOOC课件相关的知识，希望对你有一定的参考价值。

# coding: utf-8

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import *
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5
import time
import os
import re
# Shared browser session: used by get_courseware() and closed in main().
driver = webdriver.Chrome()

# Alternative browser. This line was a comment in the script; note that the
# Selenium class is spelled `Firefox` (there is no `webdriver.FireFox`).
# driver = webdriver.Firefox()

# Explicit wait that polls for up to 10 seconds before timing out.
wait = WebDriverWait(driver, 10)
def download(url, file_name):
    """Download ``url`` to ``file_name`` unless a usable copy already exists.

    A file counts as already downloaded when it exists and is larger than
    10 bytes; anything smaller is fetched again.

    Args:
        url: Direct download link of the document.
        file_name: Destination path on disk.
    """
    if os.path.exists(file_name) and os.path.getsize(file_name) > 10:
        print("\t文件已存在：{}".format(file_name))
        return

    headers = {
        'Host': 'hubble.netease.com',
        'Origin': 'https://www.icourse163.org',
        # Referer is the download URL with any "#..." fragment removed.
        'Referer': url.split("#")[0],
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
    }
    # Fetch first and only then open the file, so a failed request cannot
    # leave an empty or truncated file behind.
    # NOTE(review): verify=False disables TLS certificate verification.
    r = requests.get(url, headers=headers, verify=False)
    if not r.ok:
        # Do not store an HTTP error page under the courseware file name.
        print("\t下载失败（HTTP {}）：{}".format(r.status_code, file_name))
        return
    with open(file_name, "wb") as f:
        f.write(r.content)
    print("\t下载成功：{}".format(file_name))

# Arguments: courseware URL, storage path, chapter range [a, b]

# CSS selector template for the heading of the chapter at a given nth-child
# position in the chapter list.
_CHAPTER_HEADING = ("div > div.m-learnChapterList> div.m-learnChapterNormal:nth-child({})"
                    " > div.titleBox > h3")

# Matches the leading "第…周" / "第…章" part of a chapter heading.
_WEEK_PATTERN = re.compile(r'.*?第(.{1,3})(周|章).*?')


def _clean_name(text):
    """Replace path separators and colons so ``text`` can be part of a file name."""
    return (text.replace(":", "-").replace("/", " ").replace("\\", " ")
            .replace("课件：", " ").replace("：", " "))


def _week_label(heading_text):
    """Return the "第…周/章" prefix of a chapter heading, or the whole heading."""
    match = _WEEK_PATTERN.match(heading_text)
    return match.group(0) if match else heading_text


def _open_chapter(index):
    """Click the heading at position 3, then return the clickable heading at ``index``."""
    # NOTE(review): position 3 appears to be the first chapter entry; why it is
    # clicked before every chapter is not visible here - confirm on the live page.
    first_heading = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, _CHAPTER_HEADING.format(3)))
    )
    first_heading.click()
    return wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, _CHAPTER_HEADING.format(index)))
    )


def _wait_for_nth_clickable(selector, position):
    """Wait until the ``position``-th (1-based) match of ``selector`` is displayed and enabled."""
    def locate(drv):
        matches = drv.find_elements(By.CSS_SELECTOR, selector)
        if len(matches) < position:
            return False
        candidate = matches[position - 1]
        try:
            ready = candidate.is_displayed() and candidate.is_enabled()
        except StaleElementReferenceException:
            # The element was replaced while being checked; poll again.
            return False
        return candidate if ready else False

    return wait.until(locate)


def _download_chapters(courseware_url, path, c_range):
    """Open the course page and download every document in the selected chapters."""
    driver.get(courseware_url)
    course_heading = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#g-body > div.m-learnhead > div > div > div > a.f-fl > h4"))
    )
    # For ".../learn/XDU-1001638014?..." this captures "XDU".
    school_name = re.findall(r'/([a-zA-Z]+)-', courseware_url)[0]
    course_dir = os.path.join(path, course_heading.text + "_" + school_name)
    # Kept in its own variable: rebinding ``path`` would nest the directories
    # one level deeper on every retry.
    target_dir = os.path.join(course_dir, "courseware")
    os.makedirs(target_dir, exist_ok=True)

    # Total number of chapters on the page.
    chapter_total = len(driver.find_elements(
        By.CSS_SELECTOR,
        "div > div.m-learnChapterList> div.m-learnChapterNormal > div.titleBox > h3"))
    last_chapter = c_range[1] if c_range[1] != 0 else chapter_total

    # Chapter headings are addressed by nth-child positions offset by 3.
    for index in range(3 + c_range[0], 3 + last_chapter):
        driver.refresh()
        heading = _open_chapter(index)
        heading_text = heading.text
        print("{}:".format(heading_text), end="\t")
        week = _clean_name(_week_label(heading_text))
        heading.click()
        time.sleep(3)
        print(len(driver.find_elements(
            By.XPATH, '//div[@class="sourceList"]/*[@title="文档讲稿"]')))

        lesson_total = len(driver.find_elements(By.CSS_SELECTOR, 'div.u-learnLesson > h4'))
        for lesson_no in range(1, lesson_total + 1):
            lesson_heading = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     'div.u-learnLesson:nth-of-type({}) > h4.j-name'.format(lesson_no)))
            )
            lesson_title = lesson_heading.text
            doc_selector = (
                f'div.u-learnLesson:nth-of-type({lesson_no}) > div.sourceList > div[title^="文档"]')
            doc_total = len(driver.find_elements(By.CSS_SELECTOR, doc_selector))
            for doc_no in range(1, doc_total + 1):
                # Pick the doc_no-th document; selecting only the first match
                # would fetch the same file on every pass.
                doc_entry = _wait_for_nth_clickable(doc_selector, doc_no)
                doc_title = doc_entry.get_attribute("title")
                doc_entry.click()
                time.sleep(0.2)
                download_btn = wait.until(
                    EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, '文档下载'))
                )
                download_url = download_btn.get_attribute("href")
                title = _clean_name(f'{lesson_title} {doc_title}')
                print(week, "   ", title)
                extension = download_url.split(".")[-1].split('&')[0]
                file_name = os.path.join(
                    target_dir, week + " " + "".join(title.split()) + "." + extension)
                print(file_name)
                download(download_url, file_name)
                # Return to the chapter list and reopen the current chapter.
                driver.back()
                time.sleep(1)
                _open_chapter(index).click()


def get_courseware(courseware_url, path, c_range=(0, 0)):
    """Download the document courseware of a course, chapter by chapter.

    Files are stored under ``<path>/<course title>_<school>/courseware``.
    The whole run is attempted at most twice; it is retried only after a
    ``FileNotFoundError``.

    Args:
        courseware_url: URL of the course's content page.
        path: Base directory under which the course folder is created.
        c_range: Two-item sequence ``(a, b)``. The first ``a`` chapters are
            skipped and processing stops after chapter ``b``; ``b == 0``
            means "through the last chapter".

    Raises:
        IndexError: If no school code can be extracted from ``courseware_url``.
    """
    for _attempt in range(2):
        try:
            _download_chapters(courseware_url, path, c_range)
        except FileNotFoundError as err:
            print("FileNotFoundError: [Errno 2] No such file or directory: {}".format(
                err.filename))
        else:
            break

def main():
    """Download the courseware of the configured course, then close the browser."""
    courseware_url = 'https://www.icourse163.org/learn/XDU-1001638014?tid=1462808447#/learn/content'
    path = r"D:\大二下\信号与系统\中国大学MOOC"
    try:
        # Arguments: courseware URL, storage path, chapter range [a, b]
        # (chapter a through chapter b; [0, 0] means all chapters).
        get_courseware(courseware_url, path, [0, 0])
    finally:
        # Close the browser even if the download run raises.
        driver.quit()

# Run the downloader only when the file is executed as a script.
if __name__ == '__main__':
    main()

以上是关于Python批量下载MOOC课件的主要内容，如果未能解决你的问题，请参考以下文章

Python批量下载MOOC课件

coding: utf-8

# driver = webdriver.Firefox()

课件地址 存储路径 范围a, b

课件地址存储路径范围a, b