LeetCode (Multithreading) - 1242. Web Crawler Multithreaded

Posted by 程序员牧码


Problem: LeetCode 1242. Web Crawler Multithreaded

Problem summary: given a startUrl and an HtmlParser interface, crawl every link reachable from startUrl that shares its host name, using multiple threads, and return the crawled URLs without duplicates.

Solution approach: keep a concurrent set of URLs already seen, a lock-protected result list, and a lock-protected queue of URLs waiting to be crawled. The main loop pulls a URL from the queue and hands it to a new worker thread; each worker fetches the page's links via HtmlParser, keeps only unseen same-host URLs, and appends them to both the result list and the crawl queue. An AtomicInteger tracks how many workers are still running, so the main loop terminates once the queue is empty and no worker is active.

Related companies

  • Databricks

AC Code

import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.LockSupport;
import java.util.concurrent.locks.ReentrantLock;

/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation.
 * interface HtmlParser {
 *     public List<String> getUrls(String url);
 * }
 */
class Solution {
    // Set of all URLs seen so far; used to de-duplicate across threads.
    private ConcurrentHashMap<String, Boolean> totalUrls = new ConcurrentHashMap<>();

    // Result URL list and its lock.
    private ReentrantLock resultLock = new ReentrantLock();
    private LinkedList<String> resultUrls = new LinkedList<>();

    // Queue of URLs waiting to be crawled and its lock.
    private ReentrantLock crawlLock = new ReentrantLock();
    private LinkedList<String> urlsToCrawl = new LinkedList<>();

    // Number of worker threads currently running.
    private AtomicInteger choreCount = new AtomicInteger(0);
    
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String hostName = extractHostName(startUrl);

        this.totalUrls.put(startUrl, true);

        addUrlToResult(startUrl);
        addUrlToCrawl(startUrl);

        while (true) {
            String urlToCrawl = fetchUrlToCrawl();
            if (urlToCrawl != null) {
                incrChore();
                Chore chore = new Chore(this, hostName, htmlParser, urlToCrawl);
                (new Thread(chore)).start();
            } else {
                // No URL is currently available: if no worker is still running,
                // the crawl is finished; otherwise wait briefly and poll again.
                if (this.choreCount.get() == 0) {
                    break;
                }
                LockSupport.parkNanos(1L);
            }
        }

        return fetchResultUrls();
    }

    private String extractHostName(String url) {
        // HTTP protocol only: skip the leading "http://" (7 characters); the host
        // name is everything up to the first '/', or the whole remainder if none.
        // e.g. "http://news.yahoo.com/news/topics/" -> "news.yahoo.com"
        String processedUrl = url.substring(7);

        int index = processedUrl.indexOf("/");
        if (index == -1) {
            return processedUrl;
        } else {
            return processedUrl.substring(0, index);
        }
    }

    private class Chore implements Runnable {
        private Solution solution;
        private String hostName;
        private HtmlParser htmlParser;
        private String urlToCrawl;

        public Chore(Solution solution, String hostName, HtmlParser htmlParser, String urlToCrawl) {
            this.solution = solution;
            this.hostName = hostName;
            this.htmlParser = htmlParser;
            this.urlToCrawl = urlToCrawl;
        }

        @Override
        public void run() {
            try {
                filterUrls(this.htmlParser.getUrls(urlToCrawl));
            } finally {
                this.solution.decrChore();
            }
        }

        private void filterUrls(List<String> crawledUrls) {
            if (crawledUrls == null || crawledUrls.isEmpty()) {
                return;
            }

            for (String url : crawledUrls) {
                // If the URL has already been seen, there is no need to crawl it again.
                if (this.solution.totalUrls.containsKey(url)) {
                    continue;
                }

                this.solution.totalUrls.put(url, true);

                String crawlHostName = this.solution.extractHostName(url);
                if (!crawlHostName.equals(this.hostName)) {
                    // Drop URLs whose host name differs from the start URL's host name.
                    continue;
                }

                // Add the URL to the result list.
                this.solution.addUrlToResult(url);
                // Add the URL to the crawl queue so its links are crawled in the next hop.
                this.solution.addUrlToCrawl(url);
            }
        }
    }

    private void addUrlToResult(String url) {
        this.resultLock.lock();
        try {
            this.resultUrls.add(url);
        } finally {
            this.resultLock.unlock();
        }
    }

    private List<String> fetchResultUrls() {
        this.resultLock.lock();
        try {
            return this.resultUrls;
        } finally {
            this.resultLock.unlock();
        }
    }

    private void addUrlToCrawl(String url) {
        this.crawlLock.lock();
        try {
            this.urlsToCrawl.add(url);
        } finally {
            this.crawlLock.unlock();
        }
    }

    private String fetchUrlToCrawl() {
        this.crawlLock.lock();
        try {
            return this.urlsToCrawl.poll();
        } finally {
            this.crawlLock.unlock();
        }
    }

    private void incrChore() {
        this.choreCount.incrementAndGet();
    }

    private void decrChore() {
        this.choreCount.decrementAndGet();
    }
}

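For local experimentation, a minimal test sketch is shown below. The HtmlParser interface mirrors the one described in the problem statement; the MockHtmlParser class, its link map, and the example URLs are hypothetical and exist only to exercise Solution.crawl() outside the LeetCode judge.

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Interface as described in the problem statement.
interface HtmlParser {
    List<String> getUrls(String url);
}

// Hypothetical in-memory "web": each URL maps to the URLs found on that page.
class MockHtmlParser implements HtmlParser {
    private final Map<String, List<String>> links = new HashMap<>();

    MockHtmlParser() {
        links.put("http://news.example.com",
                Arrays.asList("http://news.example.com/news", "http://other.example.org"));
        links.put("http://news.example.com/news",
                Arrays.asList("http://news.example.com", "http://news.example.com/sports"));
    }

    @Override
    public List<String> getUrls(String url) {
        return links.getOrDefault(url, Collections.emptyList());
    }
}

class CrawlerDemo {
    public static void main(String[] args) {
        List<String> result = new Solution().crawl("http://news.example.com", new MockHtmlParser());
        // Expected (order may vary): the three URLs under news.example.com;
        // http://other.example.org is dropped because its host name differs.
        System.out.println(result);
    }
}

Note that the accepted code above spawns one thread per URL and the main loop busy-waits with LockSupport.parkNanos until all workers finish; a bounded ExecutorService would limit the number of threads, but the simpler one-thread-per-URL approach is what the solution above uses.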