采集一个网站上的所有图片
时间:2011-03-02 来源:微软中国
这个网站的结构看 URL 就能知道:列表页可以循环遍历,每个列表页里有 N 篇文章,每篇文章又分成若干页。
采集思路就是循环:从大范围(列表页)逐层循环到小范围(文章分页),然后取出所有的图片。
/// <summary>
/// Crawls every listing page of the site, collects the article links on each
/// page, and downloads the images of every article found.
/// </summary>
protected void Button1_Click(object sender, EventArgs e)
{
    // Pages 1..167 — the site's last listing page at the time of writing.
    // TODO(review): consider discovering the last page instead of hard-coding it.
    for (int page = 1; page < 168; page++)
    {
        string urlBegin = "http://babe.1626.com/page/" + page;

        string htmlText = GetHtmlSource(urlBegin, Encoding.UTF8);
        if (string.IsNullOrEmpty(htmlText.Trim()))
        {
            // Fix: the original did `return` here, so one unavailable page
            // aborted the whole crawl. Skip the page and keep going.
            continue;
        }

        var html = new HtmlAgilityPack.HtmlDocument();
        html.OptionFixNestedTags = true;
        html.LoadHtml(htmlText);
        var document = html.DocumentNode;

        // Fix: query once and iterate the result. The original re-ran the
        // selector on every loop pass (O(n^2)) and, worse, sized the loop by
        // the ".dialog" count while indexing the ".dialog .post .title"
        // array — an IndexOutOfRange when the two counts differ.
        var titleNodes = document.QuerySelectorAll(".dialog .post .title").ToArray();

        var titleList = new List<string>();
        var pageUrlArticleLink = new List<string>();
        foreach (var node in titleNodes)
        {
            titleList.Add(node.InnerText);
            // The title's InnerHtml contains the <a href=...>; extract the URL.
            pageUrlArticleLink.Add(MatchUrl(node.InnerHtml));
        }

        // Download the images of every article on this listing page.
        foreach (string articleUrl in pageUrlArticleLink)
        {
            GetPageSizeUrl(articleUrl);
        }
    }
}
41
/// <summary>
/// Downloads every image belonging to one article, following the article's
/// pagination links (".flickr p a") when present.
/// </summary>
/// <param name="UrlArticleLink">Absolute URL of the article's first page.</param>
/// <returns>Always the empty string (kept for caller compatibility).</returns>
public static string GetPageSizeUrl(string UrlArticleLink)
{
    string htmlText = GetHtmlSource(UrlArticleLink, Encoding.UTF8);
    if (string.IsNullOrEmpty(htmlText.Trim()))
    {
        return "";
    }

    var html = new HtmlAgilityPack.HtmlDocument();
    html.OptionFixNestedTags = true;
    html.LoadHtml(htmlText);
    var document = html.DocumentNode;

    // The page <title> is used as the base of the saved file names.
    string girlsName = document.QuerySelectorAll("title").ToArray()[0].InnerText;

    // Collect the article's own URL plus any pagination links.
    var urlPageSize = new List<string> { UrlArticleLink };
    if (document.QuerySelectorAll(".flickr p").Count() != 0)
    {
        // Fix: query once; the original re-ran the selector on every pass.
        // NOTE(review): selector depends on the site's pager markup — verify
        // if the theme changes.
        var pagerLinks = document.QuerySelectorAll(".flickr p a").ToArray();
        foreach (var link in pagerLinks)
        {
            urlPageSize.Add(MatchUrl(link.OuterHtml));
        }
    }

    // Gather the image URLs from every page of the article.
    var allImgUrl = new List<string>();
    foreach (string pageUrl in urlPageSize)
    {
        allImgUrl.AddRange(GetImg(pageUrl));
    }

    for (int i = 0; i < allImgUrl.Count; i++)
    {
        // Fix: append the index — the timestamp alone (0.1 ms resolution)
        // can collide between fast iterations and silently overwrite files.
        string fileName = girlsName + DateTime.Now.ToString("yyyyMMddHHmmssffff") + "_" + i;

        try
        {
            DownloadOneFileByURLWithWebClient(
                CnShuk.Common.PageValidate.InputText(fileName),
                allImgUrl[i],
                System.Web.HttpContext.Current.Server.MapPath("img"));
        }
        catch (SystemException)
        {
            // Best effort: skip images that fail to download.
            continue;
        }
    }
    return "";
}
105
106
/// <summary>
/// Downloads a single file via WebClient into <paramref name="localPath"/>.
/// </summary>
/// <param name="fileName">File name without extension (".jpg" is appended).</param>
/// <param name="url">Source URL of the file.</param>
/// <param name="localPath">Destination directory; created if missing.</param>
public static void DownloadOneFileByURLWithWebClient(string fileName, string url, string localPath)
{
    // ':' is illegal in Windows file names (it can come from the page title).
    if (fileName.Contains(":"))
    {
        fileName = fileName.Replace(':', 'a');
    }

    // Fix: Path.Combine avoids the missing-separator bug of localPath + fileName,
    // and the existence check now uses the same ".jpg" name that is written.
    string destination = Path.Combine(localPath, fileName + ".jpg");

    if (Directory.Exists(localPath) == false)
    {
        Directory.CreateDirectory(localPath);
    }
    if (File.Exists(destination))
    {
        File.Delete(destination);
    }

    // Fix: the original downloaded to the hard-coded @"c:\img\" and ignored
    // localPath entirely. Also dispose the WebClient when done.
    using (var wc = new System.Net.WebClient())
    {
        wc.DownloadFile(url, destination);
    }
}
140
/// <summary>
/// Extracts the image URLs from one article page.
/// </summary>
/// <param name="Url">Page URL (may be a pagination page of an article).</param>
/// <returns>List of absolute image URLs; empty when the page yields none.</returns>
public static List<string> GetImg(string Url)
{
    var imgUrl = new List<string>();
    string htmlText = GetHtmlSource(Url, Encoding.UTF8);
    if (string.IsNullOrEmpty(htmlText.Trim()))
    {
        return imgUrl;
    }

    var html = new HtmlAgilityPack.HtmlDocument();
    html.OptionFixNestedTags = true;
    html.LoadHtml(htmlText);
    var document = html.DocumentNode;

    // Fix: query once and iterate — the original re-ran the selector for
    // every index, doing O(n^2) work per page.
    var imgNodes = document.QuerySelectorAll(".content img").ToArray();
    foreach (var node in imgNodes)
    {
        // Pull the first absolute URL out of the <img> tag's markup.
        Match m = Regex.Match(node.OuterHtml, @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
        if (m.Success)
        {
            // Skip the theme's transparent placeholder image.
            if (m.ToString() != "http://babe.1626.com/wp-content/themes/elegant-box/images/transparent.gif")
            {
                imgUrl.Add(m.ToString());
            }
        }
    }
    return imgUrl;
}
177
/// <summary>
/// Pulls the first absolute http URL out of a string of markup.
/// </summary>
/// <param name="Url">Text that may contain a URL (e.g. an anchor tag).</param>
/// <returns>The first URL found, or the empty string when there is none.</returns>
public static string MatchUrl(string Url)
{
    var match = Regex.Match(Url, @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
    return match.Success ? match.ToString() : "";
}
196
/// <summary>
/// Downloads the HTML source of a URL as a string.
/// </summary>
/// <param name="url">Page URL to fetch.</param>
/// <param name="charset">Encoding used to decode the response body.</param>
/// <returns>The page source, or "" when the request fails.</returns>
public static string GetHtmlSource(string url, Encoding charset)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        // Fix: the original never disposed the response/stream/reader, which
        // leaks connections over a long crawl. `using` releases them even on
        // a mid-read failure.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, charset))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: callers treat "" as "page unavailable".
        return "";
    }
}
221 }
222 }
这段代码没什么技术含量,只是自己采集着玩,我用它采集了 3G 的图片;程序不太稳定,也懒得优化了。里面用到了两个比较不错的库:Fizzler 和 HtmlAgilityPack。国内关于它们的资料也有,HtmlAgilityPack 的资料国外更多,都很简单。其实采集到之后还可以先插入临时数据库,再导入到正式数据库里;我比较懒,只要图片。代码比较粗糙,附件暂时没能上传,大家把代码粘贴过去自己调一调就能用了。
相关阅读 更多 +