Java:使用 Selenium 抓取并下载 Pluralsight 视频
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了如何在 Java 中使用 Selenium 抓取并下载 Pluralsight 视频的相关知识,希望对你有一定的参考价值。
package selenuimScraper;
import org.apache.commons.io.FileUtils;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
/**
 * Downloads every video listed in a manifest file into the current working directory.
 *
 * <p>Each manifest line has the form {@code <fileName>=#=<videoUrl>}. Lines that do not split
 * into exactly those two parts (blank lines included) are skipped. Each video is saved as
 * {@code <lineNumber>#<fileName>.mp4}, where {@code lineNumber} is the 1-based position of the
 * line in the manifest.
 *
 * <p>The manifest is read with the platform default charset.
 *
 * Created by liuyufei on 5/04/17.
 */
public class DownLoadVideo {
    /** Manifest that is read when no path is given on the command line. */
    private static final String DEFAULT_MANIFEST = "Design Patterns in Java: Structural";

    /** Separator between the file name and the video URL on a manifest line. */
    private static final String SEPARATOR = "=#=";

    /** Path separators, characters rejected by common file systems, and control characters. */
    private static final java.util.regex.Pattern UNSAFE_FILE_NAME_CHARS =
            java.util.regex.Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /** Upper bound on establishing a connection, so one dead host cannot stall the whole run. */
    private static final int CONNECT_TIMEOUT_MS = 30_000;

    /** Upper bound on a single blocking read (not on the whole transfer). */
    private static final int READ_TIMEOUT_MS = 60_000;

    /** Raw manifest lines, in file order; filled by {@link #main} and consumed by the download loop. */
    static List<String> lines = new ArrayList<>();

    /**
     * Reads the manifest and downloads every entry in it.
     *
     * @param args optional; {@code args[0]} is the manifest path. When absent, the file
     *             {@code "Design Patterns in Java: Structural"} in the working directory is used.
     */
    public static void main(String[] args) {
        String manifest = (args != null && args.length > 0) ? args[0] : DEFAULT_MANIFEST;
        try (BufferedReader br = new BufferedReader(new FileReader(manifest))) {
            // Read first, then test: only real lines end up in the list, so the numeric
            // prefix of the first video is 1 and the progress total is not inflated.
            String line;
            while ((line = br.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so a single handler covers both.
            System.err.println("could not read manifest \"" + manifest + "\"");
            e.printStackTrace();
        }
        // Best effort: whatever was read before a failure is still downloaded.
        doDownLoad();
    }

    /**
     * Downloads each well-formed entry of {@link #lines}. A failed download is reported on
     * stderr and does not stop the remaining ones.
     */
    private static void doDownLoad() {
        int total = lines.size();
        for (int i = 0; i < total; i++) {
            String[] f2s = lines.get(i).split(SEPARATOR);
            if (f2s.length != 2) {
                continue;
            }
            String fileName = f2s[0];
            String src = f2s[1].trim();
            // The name comes from a scraped page title: neutralise anything that could be read
            // as a path (e.g. "Input/Output", "..\\x") or that the file system would reject.
            String safeName = UNSAFE_FILE_NAME_CHARS.matcher(fileName).replaceAll("_");
            try {
                URL website = new URL(src);
                File f = new File((i + 1) + "#" + safeName + ".mp4");
                FileUtils.copyURLToFile(website, f, CONNECT_TIMEOUT_MS, READ_TIMEOUT_MS);
                System.out.println("download \"" + fileName + "\" done " + ((i + 1) * 100) / total + "%");
            } catch (IOException e) {
                System.err.println("download \"" + fileName + "\" failed from " + src);
                e.printStackTrace();
            }
        }
    }
}
package selenuimScraper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;
/**
 * Logs in to Pluralsight with Selenium, collects the clip pages of one course and resolves each
 * page to the {@code src} URL of its video player.
 *
 * <p>The result is written to a file named after the course title, one
 * {@code <index>_<clipTitle>=#=<url>} line per clip, in table-of-contents order.
 *
 * <p>Two browsers are used: {@link #parent_driver} performs the login and reads the table of
 * contents, {@link #video_driver} opens the clip pages using the cookies of the logged-in
 * session.
 *
 * Created by liuyufei on 5/04/17.
 */
public class SeleScraper {
    //keep the order
    static Map<String, String> fileName2Src = new LinkedHashMap<>();
    /** Cookies of the logged-in parent session; copied into the video browser. */
    static Set<Cookie> cookieSet;
    static String courseURL = "https://app.pluralsight.com/library/courses/design-patterns-java-structural/table-of-contents";
    static String driverLocation = "/pathto/chromedriver";
    static String username = "username";
    static String password = "password";
    static WebDriver parent_driver;
    /** Created lazily by {@link #openVideo}; reset to {@code null} whenever it has been quit. */
    static WebDriver video_driver;
    /** {@code true} while the current video browser has not yet received {@link #cookieSet}. */
    static boolean firstHandleCookie = true;

    /**
     * Runs the whole scrape: login, read the clip list, resolve every clip, write the manifest.
     *
     * <p>A clip that cannot be resolved triggers one re-login and one retry. If the retry fails
     * too, a warning is printed and the entry keeps the clip page URL as its value.
     *
     * @throws InterruptedException if the thread is interrupted during one of the fixed waits
     */
    public static void main(String[] args) throws InterruptedException {
        System.setProperty("webdriver.chrome.driver", driverLocation);
        String courseName;
        try {
            login();
            courseName = getVideoList();
            // openVideo() only overwrites values of keys that already exist, which is not a
            // structural modification of the map, so iterating the entry set stays valid.
            for (Map.Entry<String, String> f2c : fileName2Src.entrySet()) {
                if (!openVideo(f2c.getKey(), f2c.getValue())) {
                    System.out.println("cookie expired, login again and retry for " + f2c.getKey() + " " + f2c.getValue());
                    relogin();
                    //try again
                    if (!openVideo(f2c.getKey(), f2c.getValue())) {
                        System.err.println("no video source found for " + f2c.getKey() + ", keeping page url " + f2c.getValue());
                    }
                }
            }
        } finally {
            // Close the browsers on every exit path so no Chrome process is left behind.
            quitDrivers();
        }
        writeToFile(courseName);
    }

    /** Quits whichever browsers are currently open and forgets them. */
    private static void quitDrivers() {
        if (parent_driver != null) {
            parent_driver.quit();
            parent_driver = null;
        }
        if (video_driver != null) {
            video_driver.quit();
            video_driver = null;
        }
    }

    /** Discards the current parent session and logs in again, refreshing {@link #cookieSet}. */
    private static void relogin() throws InterruptedException {
        firstHandleCookie = true;
        if (parent_driver != null) {
            parent_driver.quit();
        }
        login();
    }

    /**
     * Loads the course table of contents and records every clip in {@link #fileName2Src} as
     * {@code <index>_<title> -> <clip page url>}.
     *
     * @return the course title shown on the page
     */
    private static String getVideoList() throws InterruptedException {
        System.out.println("getVideoList....");
        parent_driver.get(courseURL);
        Thread.sleep(3000);
        WebElement titleElement = parent_driver.findElement(By.xpath("//*[@id=\"ps-main\"]/div/div/section/div[1]/div[2]/h1"));
        String courseName = titleElement.getText();
        List<WebElement> elements = parent_driver.findElements(By.cssSelector("a.table-of-contents__clip-list-title"));
        //have to use jsoup to get invisible videos' titles
        Document doc = Jsoup.parse(parent_driver.getPageSource());
        Elements videoList = doc.select("a.table-of-contents__clip-list-title");
        // The two lists come from separate reads of the page; never index past the shorter one.
        int count = Math.min(videoList.size(), elements.size());
        if (count != videoList.size() || count != elements.size()) {
            System.err.println("clip list mismatch: " + videoList.size() + " titles, " + elements.size() + " links");
        }
        for (int i = 0; i < count; i++) {
            //using i to make key different
            // text() yields the plain title; html() would keep entities such as "&amp;" and
            // any nested markup, which then end up in the downloaded file's name.
            fileName2Src.put(i + "_" + videoList.get(i).text(), elements.get(i).getAttribute("href"));
        }
        return courseName;
    }

    /**
     * Opens a new parent browser, submits the login form and stores the session cookies in
     * {@link #cookieSet}.
     */
    private static void login() throws InterruptedException {
        System.out.println("Login....");
        parent_driver = new ChromeDriver();
        parent_driver.get("https://app.pluralsight.com/id");
        Thread.sleep(3000); // Let the user actually see something!
        WebElement usernameEle = parent_driver.findElement(By.id("Username"));
        WebElement passwordEle = parent_driver.findElement(By.id("Password"));
        usernameEle.sendKeys(username);
        passwordEle.sendKeys(password);
        passwordEle.submit();
        parent_driver.get("https://app.pluralsight.com/library");
        cookieSet = parent_driver.manage().getCookies();
    }

    /**
     * Opens a clip page in the video browser and replaces the entry's value with the player's
     * {@code src} URL.
     *
     * @param fileName  key of the entry in {@link #fileName2Src} to update
     * @param videoPage URL of the clip page
     * @return {@code true} if a non-empty video source was found; {@code false} if the player
     *         element or its {@code src} is missing, in which case the video browser has been
     *         quit and will be recreated on the next call
     */
    private static boolean openVideo(String fileName, String videoPage) throws InterruptedException {
        System.out.println("openVideo src " + videoPage);
        if (video_driver == null) {
            System.out.println("init video driver...");
            video_driver = new ChromeDriver();
            // A fresh browser has no session yet, whatever happened to the previous one.
            firstHandleCookie = true;
        }
        // Cookies can only be added while the browser is on the matching domain, so the page
        // is opened once to get there ...
        video_driver.get(videoPage);
        boolean cookiesWerePending = firstHandleCookie;
        handleCookie(video_driver);
        if (cookiesWerePending) {
            // ... and loaded again so that this request actually carries the new cookies.
            video_driver.get(videoPage);
        }
        Thread.sleep(3000);
        // findElements returns an empty list instead of throwing when the player is absent,
        // which lets a missing player take the same "login again" path as an empty src.
        List<WebElement> players = video_driver.findElements(By.xpath("//*[@id=\"vjs_video_3_html5_api\"]"));
        String video_src = players.isEmpty() ? null : players.get(0).getAttribute("src");
        if (video_src == null || video_src.length() == 0) {
            System.out.println("Quit video driver...");
            video_driver.quit();
            // Forget the dead session; otherwise the retry would call get() on a quit browser.
            video_driver = null;
            return false;
        }
        //overwrite value to true video url
        fileName2Src.put(fileName, video_src);
        return true;
    }

    /**
     * Copies {@link #cookieSet} into the given browser, once per session (guarded by
     * {@link #firstHandleCookie}). The browser must already be on the cookies' domain.
     */
    private static void handleCookie(WebDriver driver) {
        if (!firstHandleCookie) {
            return;
        }
        for (Cookie c : cookieSet) {
            driver.manage().addCookie(c);
            System.out.println(c);
        }
        firstHandleCookie = false;
    }

    /**
     * Writes {@link #fileName2Src} to a file named {@code courseName} in the working directory,
     * one {@code key=#=value} line per entry, using the platform default charset.
     */
    private static void writeToFile(String courseName) {
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(courseName))) {
            for (Map.Entry<String, String> src : fileName2Src.entrySet()) {
                bw.write(src.getKey() + "=#=" + src.getValue() + "\n");
            }
        } catch (IOException e) {
            System.err.println("could not write manifest \"" + courseName + "\"");
            e.printStackTrace();
        }
    }
}
以上是关于在 Java 中使用 Selenium 抓取并下载 Pluralsight 视频的主要内容,如果未能解决你的问题,请参考以下文章