Java:使用 Selenium 抓取并下载 Pluralsight 视频

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了如何用 Java 和 Selenium 抓取并下载 Pluralsight 视频的相关知识,希望对你有一定的参考价值。

package selenuimScraper;

import org.apache.commons.io.FileUtils;

import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

/**
 * Downloads every video listed in a course index file.
 *
 * <p>Each usable line of the index file has the form
 * {@code <fileName>=#=<videoUrl>}; lines that do not split into exactly two
 * parts on {@code =#=} are ignored. Every video is saved in the working
 * directory as {@code <n>#<fileName>.mp4}, where {@code n} is the 1-based
 * position of the line in the index file.
 *
 * Created by liuyufei on 5/04/17.
 */
public class DownLoadVideo {

    /** Raw lines of the index file, in file order. */
    static List<String> lines = new ArrayList<>();

    /**
     * Reads the index file into {@link #lines} and then downloads every entry.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try (BufferedReader br = new BufferedReader(new FileReader("Design Patterns in Java: Structural"))) {
            // Read first, then test: seeding the loop variable with "" would add a
            // phantom empty first entry, shifting file numbers and the progress total.
            String line;
            while ((line = br.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            // Covers FileNotFoundException too; whatever was read so far is still processed.
            e.printStackTrace();
        }
        doDownLoad();

    }

    /**
     * Downloads each {@code fileName=#=url} entry of {@link #lines} to
     * {@code <n>#<fileName>.mp4} and prints the progress after each file.
     * A failing entry is reported and skipped so the remaining ones still run.
     */
    private static void doDownLoad() {
        int total = lines.size();
        for (int i = 0; i < total; i++) {
            String[] f2s = lines.get(i).split("=#=");
            if (f2s.length != 2) {
                // Not a "name=#=url" line (blank or malformed): nothing to download.
                continue;
            }
            String fileName = f2s[0];
            String src = f2s[1];
            try {
                URL website = new URL(src);
                File f = new File((i + 1) + "#" + fileName + ".mp4");
                FileUtils.copyURLToFile(website, f);
                System.out.println("download \"" + fileName + "\" done " + ((i + 1) * 100) / total + "%");
            } catch (Exception e) {
                // Best effort: one broken URL must not abort the whole batch.
                System.err.println("download \"" + fileName + "\" failed for " + src);
                e.printStackTrace();
            }
        }

    }

}
package selenuimScraper;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

/**
 * Collects the real video URLs of one Pluralsight course with Selenium.
 *
 * <p>The scraper logs in, reads the clip list from the course's table of
 * contents, opens every clip page in a second browser to read the
 * {@code src} of its video element, and finally writes one
 * {@code <fileName>=#=<url>} line per clip to a file named after the course.
 *
 * Created by liuyufei on 5/04/17.
 */
public class SeleScraper {

    //keep the order
    static Map<String, String> fileName2Src = new LinkedHashMap<>();
    static Set<Cookie> cookieSet;

    static String courseURL = "https://app.pluralsight.com/library/courses/design-patterns-java-structural/table-of-contents";
    static String driverLocation = "/pathto/chromedriver";

    static String username = "username";
    static String password = "password";

    static WebDriver parent_driver;
    static WebDriver video_driver;
    /** True while the login cookies still have to be copied into the video driver. */
    static boolean firstHandleCookie = true;

    /**
     * Runs the whole scrape: login, list the clips, resolve each clip's video
     * URL (with one re-login and retry per clip), then write the result file.
     *
     * @param args unused
     * @throws InterruptedException if a wait between page loads is interrupted
     */
    public static void main(String[] args) throws InterruptedException {

        System.setProperty("webdriver.chrome.driver", driverLocation);
        String courseName;
        try {
            login();
            courseName = getVideoList();
            for (Map.Entry<String, String> f2c : fileName2Src.entrySet()) {
                if (!openVideo(f2c.getKey(), f2c.getValue())) {
                    System.out.println("cookie expired, login again and retry for " + f2c.getKey() + " " + f2c.getValue());
                    relogin();
                    //try again
                    if (!openVideo(f2c.getKey(), f2c.getValue())) {
                        // The entry keeps the clip page URL instead of a video URL.
                        System.out.println("retry failed, no video url for " + f2c.getKey());
                    }
                }
            }
        } finally {
            // Close both browsers even when scraping aborts; either may be unset.
            quitIfOpen(parent_driver);
            quitIfOpen(video_driver);
        }
        writeToFile(courseName);
    }

    /** Quits the given driver when there is one. */
    private static void quitIfOpen(WebDriver driver) {
        if (driver != null) {
            driver.quit();
        }
    }


    /**
     * Replaces the parent browser with a freshly logged-in one and marks the
     * cookies as needing to be copied into the video driver again.
     *
     * @throws InterruptedException if the login wait is interrupted
     */
    private static void relogin() throws InterruptedException {
        firstHandleCookie = true;
        parent_driver.quit();
        login();
    }


    /**
     * Loads the course page and fills {@link #fileName2Src} with one entry per
     * clip, keyed by {@code <index>_<title>} and valued with the clip page URL.
     *
     * @return the course title shown on the page
     * @throws InterruptedException if the page-load wait is interrupted
     */
    private static String getVideoList() throws InterruptedException {
        System.out.println("getVideoList....");

        parent_driver.get(courseURL);
        Thread.sleep(3000);

        WebElement titleElement = parent_driver.findElement(By.xpath("//*[@id=\"ps-main\"]/div/div/section/div[1]/div[2]/h1"));

        String courseName = titleElement.getText();
        List<WebElement> elements = parent_driver.findElements(By.cssSelector("a.table-of-contents__clip-list-title"));
        //have to use jsoup to get invisible videos' titles
        Document doc = Jsoup.parse(parent_driver.getPageSource());
        Elements videoList = doc.select("a.table-of-contents__clip-list-title");

        // Both lists come from the same selector; bound by the shorter one so a
        // mismatch between the live DOM and the parsed snapshot cannot overrun.
        int count = Math.min(videoList.size(), elements.size());
        for (int i = 0; i < count; i++) {
            //using i to make key different
            fileName2Src.put(i + "_" + videoList.get(i).html(), elements.get(i).getAttribute("href"));
        }

        return courseName;

    }


    /**
     * Starts a new parent browser, submits the login form and stores the
     * resulting cookies in {@link #cookieSet}.
     *
     * @throws InterruptedException if the page-load wait is interrupted
     */
    private static void login() throws InterruptedException {
        System.out.println("Login....");
        parent_driver = new ChromeDriver();
        parent_driver.get("https://app.pluralsight.com/id");
        Thread.sleep(3000);  // Let the user actually see something!
        WebElement usernameEle = parent_driver.findElement(By.xpath("//*[@id=\"Username\"]"));
        WebElement passwordEle = parent_driver.findElement(By.xpath("//*[@id=\"Password\"]"));

        usernameEle.sendKeys(username);
        passwordEle.sendKeys(password);
        passwordEle.submit();

        parent_driver.get("https://app.pluralsight.com/library");
        cookieSet = parent_driver.manage().getCookies();
    }


    /**
     * Opens a clip page in the video browser and, when its video element has a
     * non-empty {@code src}, stores that URL under {@code fileName}.
     *
     * @param fileName  key of the clip in {@link #fileName2Src}
     * @param videoPage URL of the clip page
     * @return {@code true} when a video URL was stored; {@code false} when the
     *         video element or its {@code src} was missing, in which case the
     *         video browser has been closed
     * @throws InterruptedException if the page-load wait is interrupted
     */
    private static boolean openVideo(String fileName, String videoPage) throws InterruptedException {
        System.out.println("openVideo src " + videoPage);
        if (video_driver == null) {
            System.out.println("init video driver...");
            video_driver = new ChromeDriver();
            // A new browser has no session yet, so the cookies must be copied again.
            firstHandleCookie = true;
        }
        video_driver.get(videoPage);
        if (firstHandleCookie) {
            handleCookie(video_driver);
            // The first load ran without the cookies; load again so they are sent.
            video_driver.get(videoPage);
        }
        Thread.sleep(3000);

        // findElements returns an empty list instead of throwing when the element is absent.
        List<WebElement> videos = video_driver.findElements(By.xpath("//*[@id=\"vjs_video_3_html5_api\"]"));
        //if video is missing, may need to login
        String video_src = videos.isEmpty() ? null : videos.get(0).getAttribute("src");
        if (video_src == null || video_src.length() == 0) {
            System.out.println("Quit video driver...");
            video_driver.quit();
            // Drop the dead session so the next call starts a new browser.
            video_driver = null;
            return false;
        }
        //overwrite value to true video url
        fileName2Src.put(fileName, video_src);
        return true;
    }

    /**
     * Copies the stored login cookies into {@code driver} once, until
     * {@link #firstHandleCookie} is set again.
     *
     * @param driver browser that should receive the cookies
     */
    private static void handleCookie(WebDriver driver) {
        if (firstHandleCookie) {
            cookieSet.forEach(c -> {
                driver.manage().addCookie(c);
                // Print the name only: the cookie value is a session secret.
                System.out.println(c.getName());
            });
            firstHandleCookie = false;
        }
    }


    /**
     * Writes every entry of {@link #fileName2Src} as a
     * {@code <fileName>=#=<url>} line to a file named {@code courseName}.
     *
     * @param courseName name of the output file
     */
    private static void writeToFile(String courseName) {
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(courseName))) {
            for (Map.Entry<String, String> src : fileName2Src.entrySet()) {
                bw.write(src.getKey() + "=#=" + src.getValue() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

以上是关于用 Java 和 Selenium 抓取并下载 Pluralsight 视频的主要内容,如果未能解决你的问题,请参考以下文章:

Pluralsight 学习感受

Pluralsight 学习感受

CQRS 示例和截屏视频 [关闭]

Java SE——:接口和抽象类

java相关网址汇总1

下载受保护的m3u8视频