C# 抓取网页的img src带参数的图片链接,并下载

Posted 代码描绘人生

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了如何使用 C# 抓取网页中 img src 带参数的图片链接并下载图片的相关知识,希望对你有一定的参考价值。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

namespace ImageCollection
{
    /// <summary>
    /// Main form of the image collector. Fetches a page, extracts the <c>src</c> of every
    /// <c>img</c> tag (query strings are allowed), and downloads the images that are served
    /// from <see cref="AllowedHost"/> into the "img" folder next to the executable.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Folder the images are saved to; it is wiped at the start of every run.</summary>
        private static readonly string Path = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "img");

        /// <summary>Only images served from this host are downloaded.</summary>
        private const string AllowedHost = "www.xxx.com";

        /// <summary>Matches the page's title element, including the surrounding tags.</summary>
        private static readonly Regex TitleRegex = new Regex(@"<title>.+</title>");

        /// <summary>Matches img tags and captures the src value in the <c>imgUrl</c> group.</summary>
        private static readonly Regex ImgRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler of the "grab" button: validates the URL box and starts the scrape
        /// on a background thread so the UI stays responsive.
        /// </summary>
        private void btnshuaqu_Click(object sender, EventArgs e)
        {
            string url = txturl.Text.Trim();
            if (string.IsNullOrEmpty(url))
            {
                MessageBox.Show("请输入URl");
                return;
            }
            txtimg.AppendText("开始抓取中:\r\n");
            Thread th = new Thread(() => ShuaQu(url)) { IsBackground = true };
            th.Start();
        }

        /// <summary>
        /// Worker routine: resets the output folder, downloads the page, shows its title and
        /// starts one download thread per image hosted on <see cref="AllowedHost"/>.
        /// </summary>
        /// <param name="url">Address of the page to scan.</param>
        private void ShuaQu(string url)
        {
            // Start from an empty output folder on every run.
            if (Directory.Exists(Path))
            {
                Directory.Delete(Path, true);
            }
            Directory.CreateDirectory(Path);

            // NOTE(review): WebHttp is a project helper that is not part of this file; the meaning
            // of the (null, 3) arguments and whether it can return null should be confirmed.
            string result = WebHttp.HttpGet(url, null, 3) ?? string.Empty;
            string[] imageUrls = GetHtmlImageUrlList(result);
            AppendLog("已经获取到数据!" + imageUrls.Length);

            // The match still contains the <title> tags; only double quotes are stripped.
            string title = TitleRegex.Match(result).ToString();
            txttitle.Invoke(new Action(() =>
            {
                txttitle.Text = Regex.Replace(title, @"[\""]+", "");
            }));

            foreach (string s in imageUrls)
            {
                // src values are often relative or empty; new Uri(s) would throw on those
                // and take the whole worker thread down, so skip them instead.
                Uri u;
                if (!Uri.TryCreate(s, UriKind.Absolute, out u))
                {
                    continue;
                }
                if (!string.Equals(u.Host, AllowedHost, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string imageUrl = s;
                Thread downimg = new Thread(() => DownloadImage(imageUrl)) { IsBackground = true };
                downimg.Start();
                AppendLog(imageUrl);
            }

            // Downloads run on their own threads and are not joined, so this message means
            // every download has been started, not that every file is already on disk.
            AppendLog("全部抓取完成!");
        }

        /// <summary>
        /// Downloads one image and stores it in the output folder under the file name taken
        /// from the URL with the query string removed.
        /// </summary>
        /// <param name="imgpath">Absolute URL of the image; may carry a query string.</param>
        public void Get_img(string imgpath)
        {
            // Drop "?..." so the query string does not end up in the file name.
            string[] file = imgpath.Split('?');
            string name = System.IO.Path.GetFileName(file[0]);

            using (WebClient mywebclient = new WebClient())
            {
                mywebclient.DownloadFile(imgpath, System.IO.Path.Combine(Path, name));
            }
        }

        /// <summary>
        /// Thread entry point for a single download. A failing image is reported in the log
        /// box instead of terminating the process with an unhandled exception.
        /// </summary>
        private void DownloadImage(string imageUrl)
        {
            try
            {
                Get_img(imageUrl);
            }
            catch (WebException ex)
            {
                AppendLog(imageUrl + " : " + ex.Message);
            }
            catch (ArgumentException ex)
            {
                // Raised when the URL yields a file name that is not a legal path.
                AppendLog(imageUrl + " : " + ex.Message);
            }
        }

        /// <summary>Appends one line to the log box from a worker thread.</summary>
        private void AppendLog(string text)
        {
            txtimg.Invoke(new Action(() =>
            {
                txtimg.AppendText(text + "\r\n");
            }));
        }

        /// <summary>
        /// Collects the URL of every image referenced in the given HTML.
        /// </summary>
        /// <param name="sHtmlText">HTML source to scan.</param>
        /// <returns>
        /// The src value of each img tag, in document order; an empty array when the
        /// input is null or empty.
        /// </returns>
        private string[] GetHtmlImageUrlList(string sHtmlText)
        {
            if (string.IsNullOrEmpty(sHtmlText))
            {
                return new string[0];
            }

            return ImgRegex.Matches(sHtmlText)
                           .Cast<Match>()
                           .Select(match => match.Groups["imgUrl"].Value)
                           .ToArray();
        }
    }
}

 

 

#region 下载图片到Image
/// <summary>
/// Downloads the image at the given URL and returns it as an in-memory image.
/// </summary>
/// <param name="url">Address of the image to download.</param>
/// <returns>
/// A bitmap copy of the downloaded picture that does not depend on any stream;
/// the caller is responsible for disposing it.
/// </returns>
public static Image UrlToImage(string url) {
    byte[] bytes;
    using (WebClient mywebclient = new WebClient()) {
        bytes = mywebclient.DownloadData(url);
    }

    using (MemoryStream ms = new MemoryStream(bytes))
    using (Image decoded = Image.FromStream(ms)) {
        // Image.FromStream needs its stream to stay open for the lifetime of the image.
        // Copying into a new Bitmap detaches the result from the stream, so the stream
        // can be disposed here without leaving the caller with a broken image.
        return new Bitmap(decoded);
    }
}
#endregion

 

以上是关于C# 抓取网页的img src带参数的图片链接,并下载的主要内容,如果未能解决你的问题,请参考以下文章

img src=路径 总是显示不出图片 老是一把XX 新手学网页求解决

C# 抓取并导出网页里面所有超链接方法

php抓取一个页面的图片

将html转换成canvas

用JS将图片链接从 <img src="xx/img.jpg"> 改为 <img src="xx/

当我用cheerio抓取img src时,我得到了一个巨大的字符串,而不仅仅是链接