[Java] Scraping movie download links from Dianying Tiantang (电影天堂) and Piaohua (飘花电影网) in Java

Posted by 程宇寒


I came across some forum posts where someone had scraped movie download links with Python, so on a whim I decided to write the same thing in Java. It is not particularly difficult; the code is below.

1. This is the scraper for Dianying Tiantang (电影天堂, ygdy8.net). Along the way I also tried setting a proxy and using a thread pool, but neither seemed to work out. The main dependencies are HttpClient 4.5 and Jsoup 1.7.

package downloade;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import Pojo.DyUrl;
import dao.JDBCUtils;

public class Dyttdownload {

    static int id = 1;
    public static HttpClient client = null;

    public static void main(String[] args) {
        // List pages look like http://www.ygdy8.net/html/gndy/dyzz/list_23_2.html
        Map<Integer, String> map = new HashMap<>();
        for (int i = 1; i < 50; i++) {
            map.put(i, "http://www.ygdy8.net/html/gndy/dyzz/list_23_" + i + ".html");
        }
        for (String string : map.values()) {
            getUrl(string);
        }
        // Example detail page: http://www.ygdy8.net/html/gndy/dyzz/20170926/55094.html
    }

    // Parse one list page and store every movie it links to.
    public static void getUrl(String uri) {
        JDBCUtils utils = new JDBCUtils();
        try {
            client = HttpClients.createDefault();
            HttpGet get = new HttpGet(uri);
            HttpResponse response = client.execute(get);
            // ygdy8.net serves its pages in gb2312
            String result = EntityUtils.toString(response.getEntity(), "gb2312");
            Document doc = Jsoup.parse(result);
            // CSS selector: every movie entry sits in a table.tbspan
            Elements elements = doc.select("table.tbspan");
            for (Element element : elements) {
                element.setBaseUri("http://www.ygdy8.net");
                // The second table row holds the title link
                DyUrl dy = getDownloadUrl(
                        element.select("tr").get(1).select("a").text(),
                        element.select("tr").get(1).select("a").attr("abs:href"));
                dy.setId(id);
                utils.insert(dy);
                id++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Fetch a detail page and pull the download address out of div#Zoom.
    public static DyUrl getDownloadUrl(String name, String dyurl) {
        DyUrl dy = new DyUrl();
        try {
            client = HttpClients.createDefault();
            HttpGet get = new HttpGet(dyurl);
            HttpResponse response = client.execute(get);
            String result = EntityUtils.toString(response.getEntity(), "gb2312");
            Document doc = Jsoup.parse(result);
            // The download address (usually an ftp:// link) is the anchor text
            Elements elements = doc.select("div#Zoom table tr td a");
            dy.setDyname(name);
            dy.setDyUrl(elements.get(0).text());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return dy;
    }
}
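
Both crawlers depend on a DyUrl POJO (package Pojo) and a JDBCUtils DAO (package dao) that the post does not show. Below is a minimal sketch of what they might look like: the field names simply follow the setters used above, while the JDBC URL, credentials and the dy table schema are my own assumptions, not the original author's code.

package Pojo;

// Assumed POJO matching the setters used above (setId, setDyname, setDyUrl).
public class DyUrl {
    private int id;
    private String dyname; // movie title
    private String dyUrl;  // download address (usually an ftp:// link)

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }
    public String getDyname() { return dyname; }
    public void setDyname(String dyname) { this.dyname = dyname; }
    public String getDyUrl() { return dyUrl; }
    public void setDyUrl(String dyUrl) { this.dyUrl = dyUrl; }
}

package dao;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

import Pojo.DyUrl;

// Assumed DAO: the JDBC URL, credentials and table name are placeholders.
public class JDBCUtils {
    private static final String URL =
            "jdbc:mysql://localhost:3306/movie?useUnicode=true&characterEncoding=utf8";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    public void insert(DyUrl dy) {
        String sql = "INSERT INTO dy (id, dyname, dyurl) VALUES (?, ?, ?)";
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setInt(1, dy.getId());
            ps.setString(2, dy.getDyname());
            ps.setString(3, dy.getDyUrl());
            ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}

With some pair of classes like these on the classpath (plus a MySQL table with id, dyname and dyurl columns), the two crawler classes compile and run as shown.
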
2. Below is the scraper for Piaohua (飘花电影网). As you can see, the crawling process is much the same; only the CSS selectors differ.

package downloade;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import Pojo.DyUrl;
import dao.JDBCUtils;

public class piaohuadownload {

    static int id = 1;
    public static HttpClient client = null;

    public static void main(String[] args) {
        Map<Integer, String> map = new HashMap<>();
        for (int i = 16; i < 50; i++) {
            map.put(i, "http://www.piaohua.com/html/dongzuo/list_" + i + ".html");
        }
        for (String string : map.values()) {
            System.out.println("Crawling list page " + string);
            getUrl(string);
        }
    }

    // Parse one list page and store every movie it links to.
    public static void getUrl(String uri) {
        JDBCUtils utils = new JDBCUtils();
        try {
            client = HttpClientBuilder.create().build();
            HttpResponse response = client.execute(new HttpGet(uri));
            // piaohua.com serves its pages in UTF-8
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            Document doc = Jsoup.parse(result);
            doc.setBaseUri("http://www.piaohua.com");
            // CSS selector: every movie entry is a dl inside #list
            Elements elements = doc.select("#list dl");
            for (Element element : elements) {
                String name = element.select("font").first().text();
                String dyurl = element.select("a").first().absUrl("href");
                DyUrl dy = getDownloadUrl(name, dyurl);
                dy.setId(id);
                utils.insert(dy);
                id++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Fetch a detail page and pull the download address out of #showinfo.
    public static DyUrl getDownloadUrl(String name, String dyurl) {
        DyUrl dUrl = new DyUrl();
        try {
            client = HttpClients.createDefault();
            HttpResponse response = client.execute(new HttpGet(dyurl));
            String result = EntityUtils.toString(response.getEntity(), "utf-8");
            Document doc = Jsoup.parse(result);
            // The first anchor inside #showinfo holds the download address
            Elements elements = doc.select("#showinfo").select("a");
            dUrl.setDyname(name);
            dUrl.setDyUrl(elements.first().text());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return dUrl;
    }
}
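
As mentioned at the top, I also tried a proxy and a thread pool for the Dianying Tiantang crawler, but neither attempt worked out. For reference, here is a rough sketch of how they would typically be wired together with HttpClient 4.5 and an ExecutorService. The proxy address is the one from my original commented-out attempt and is almost certainly dead by now, and the class name ProxyPoolSketch is just a placeholder, so treat this as an illustration rather than a working configuration.

package downloade;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class ProxyPoolSketch {
    public static void main(String[] args) throws Exception {
        // Route every request through an HTTP proxy (address is a stale placeholder).
        HttpHost proxy = new HttpHost("110.73.14.161", 8123);
        RequestConfig config = RequestConfig.custom()
                .setProxy(proxy)
                .setConnectTimeout(5000)
                .setSocketTimeout(5000)
                .build();
        CloseableHttpClient client = HttpClients.custom()
                .setDefaultRequestConfig(config)
                .build();

        // Crawl the list pages concurrently with a fixed-size pool.
        ExecutorService pool = Executors.newFixedThreadPool(10);
        for (int i = 1; i < 50; i++) {
            final String url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_" + i + ".html";
            pool.submit(() -> {
                try {
                    String html = EntityUtils.toString(
                            client.execute(new HttpGet(url)).getEntity(), "gb2312");
                    System.out.println(url + " -> " + html.length() + " chars");
                } catch (Exception e) {
                    e.printStackTrace();
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(10, TimeUnit.MINUTES);
        client.close();
    }
}

The parsing and database code from the classes above would slot into the body of the submitted task; sharing one client across threads is fine, but the static id counter would then need to become thread-safe (for example an AtomicInteger).
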
Finally, a couple of screenshots of a successful run. The last one shows the data being used on a web page.

[Screenshot: mv.png]

Rendered on the web page:

[Screenshot: dy.png]