采集一个网站上的所有图片
时间:2011-03-02 来源:微软中国
这个网站的结构看 URL 就能知道:列表页可以循环遍历,每个列表页里有 N 篇文章,每篇文章又分成若干页。
采集思路就是循环:从大范围(列表页)逐层循环到小范围(文章分页),然后取出所有的图片。
/// <summary>
/// Crawls every listing page of the site, collects the article links on each
/// page, and downloads the images of every article found.
/// </summary>
protected void Button1_Click(object sender, EventArgs e)
{
    // Pages 1..167 — the site's last listing page at the time of writing.
    // TODO(review): consider discovering the last page instead of hard-coding it.
    for (int page = 1; page < 168; page++)
    {
        string urlBegin = "http://babe.1626.com/page/" + page;

        string htmlText = GetHtmlSource(urlBegin, Encoding.UTF8);
        if (string.IsNullOrEmpty(htmlText.Trim()))
        {
            // Fix: the original did `return` here, so one unavailable page
            // aborted the whole crawl. Skip the page and keep going.
            continue;
        }

        var html = new HtmlAgilityPack.HtmlDocument();
        html.OptionFixNestedTags = true;
        html.LoadHtml(htmlText);
        var document = html.DocumentNode;

        // Fix: query once and iterate the result. The original re-ran the
        // selector on every loop pass (O(n^2)) and, worse, sized the loop by
        // the ".dialog" count while indexing the ".dialog .post .title"
        // array — an IndexOutOfRange when the two counts differ.
        var titleNodes = document.QuerySelectorAll(".dialog .post .title").ToArray();

        var titleList = new List<string>();
        var pageUrlArticleLink = new List<string>();
        foreach (var node in titleNodes)
        {
            titleList.Add(node.InnerText);
            // The title's InnerHtml contains the <a href=...>; extract the URL.
            pageUrlArticleLink.Add(MatchUrl(node.InnerHtml));
        }

        // Download the images of every article on this listing page.
        foreach (string articleUrl in pageUrlArticleLink)
        {
            GetPageSizeUrl(articleUrl);
        }
    }
}
41
/// <summary>
/// Downloads every image belonging to one article, following the article's
/// pagination links (".flickr p a") when present.
/// </summary>
/// <param name="UrlArticleLink">Absolute URL of the article's first page.</param>
/// <returns>Always the empty string (kept for caller compatibility).</returns>
public static string GetPageSizeUrl(string UrlArticleLink)
{
    string htmlText = GetHtmlSource(UrlArticleLink, Encoding.UTF8);
    if (string.IsNullOrEmpty(htmlText.Trim()))
    {
        return "";
    }

    var html = new HtmlAgilityPack.HtmlDocument();
    html.OptionFixNestedTags = true;
    html.LoadHtml(htmlText);
    var document = html.DocumentNode;

    // The page <title> is used as the base of the saved file names.
    string girlsName = document.QuerySelectorAll("title").ToArray()[0].InnerText;

    // Collect the article's own URL plus any pagination links.
    var urlPageSize = new List<string> { UrlArticleLink };
    if (document.QuerySelectorAll(".flickr p").Count() != 0)
    {
        // Fix: query once; the original re-ran the selector on every pass.
        // NOTE(review): selector depends on the site's pager markup — verify
        // if the theme changes.
        var pagerLinks = document.QuerySelectorAll(".flickr p a").ToArray();
        foreach (var link in pagerLinks)
        {
            urlPageSize.Add(MatchUrl(link.OuterHtml));
        }
    }

    // Gather the image URLs from every page of the article.
    var allImgUrl = new List<string>();
    foreach (string pageUrl in urlPageSize)
    {
        allImgUrl.AddRange(GetImg(pageUrl));
    }

    for (int i = 0; i < allImgUrl.Count; i++)
    {
        // Fix: append the index — the timestamp alone (0.1 ms resolution)
        // can collide between fast iterations and silently overwrite files.
        string fileName = girlsName + DateTime.Now.ToString("yyyyMMddHHmmssffff") + "_" + i;

        try
        {
            DownloadOneFileByURLWithWebClient(
                CnShuk.Common.PageValidate.InputText(fileName),
                allImgUrl[i],
                System.Web.HttpContext.Current.Server.MapPath("img"));
        }
        catch (SystemException)
        {
            // Best effort: skip images that fail to download.
            continue;
        }
    }
    return "";
}
105
106
/// <summary>
/// Downloads a single file via WebClient into <paramref name="localPath"/>.
/// </summary>
/// <param name="fileName">File name without extension (".jpg" is appended).</param>
/// <param name="url">Source URL of the file.</param>
/// <param name="localPath">Destination directory; created if missing.</param>
public static void DownloadOneFileByURLWithWebClient(string fileName, string url, string localPath)
{
    // ':' is illegal in Windows file names (it can come from the page title).
    if (fileName.Contains(":"))
    {
        fileName = fileName.Replace(':', 'a');
    }

    // Fix: Path.Combine avoids the missing-separator bug of localPath + fileName,
    // and the existence check now uses the same ".jpg" name that is written.
    string destination = Path.Combine(localPath, fileName + ".jpg");

    if (Directory.Exists(localPath) == false)
    {
        Directory.CreateDirectory(localPath);
    }
    if (File.Exists(destination))
    {
        File.Delete(destination);
    }

    // Fix: the original downloaded to the hard-coded @"c:\img\" and ignored
    // localPath entirely. Also dispose the WebClient when done.
    using (var wc = new System.Net.WebClient())
    {
        wc.DownloadFile(url, destination);
    }
}
140
/// <summary>
/// Extracts the image URLs from one article page.
/// </summary>
/// <param name="Url">Page URL (may be a pagination page of an article).</param>
/// <returns>List of absolute image URLs; empty when the page yields none.</returns>
public static List<string> GetImg(string Url)
{
    var imgUrl = new List<string>();
    string htmlText = GetHtmlSource(Url, Encoding.UTF8);
    if (string.IsNullOrEmpty(htmlText.Trim()))
    {
        return imgUrl;
    }

    var html = new HtmlAgilityPack.HtmlDocument();
    html.OptionFixNestedTags = true;
    html.LoadHtml(htmlText);
    var document = html.DocumentNode;

    // Fix: query once and iterate — the original re-ran the selector for
    // every index, doing O(n^2) work per page.
    var imgNodes = document.QuerySelectorAll(".content img").ToArray();
    foreach (var node in imgNodes)
    {
        // Pull the first absolute URL out of the <img> tag's markup.
        Match m = Regex.Match(node.OuterHtml, @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
        if (m.Success)
        {
            // Skip the theme's transparent placeholder image.
            if (m.ToString() != "http://babe.1626.com/wp-content/themes/elegant-box/images/transparent.gif")
            {
                imgUrl.Add(m.ToString());
            }
        }
    }
    return imgUrl;
}
177
/// <summary>
/// Pulls the first absolute http URL out of a string of markup.
/// </summary>
/// <param name="Url">Text that may contain a URL (e.g. an anchor tag).</param>
/// <returns>The first URL found, or the empty string when there is none.</returns>
public static string MatchUrl(string Url)
{
    var match = Regex.Match(Url, @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
    return match.Success ? match.ToString() : "";
}
196
/// <summary>
/// Downloads the HTML source of a URL as a string.
/// </summary>
/// <param name="url">Page URL to fetch.</param>
/// <param name="charset">Encoding used to decode the response body.</param>
/// <returns>The page source, or "" when the request fails.</returns>
public static string GetHtmlSource(string url, Encoding charset)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        // Fix: the original never disposed the response/stream/reader, which
        // leaks connections over a long crawl. `using` releases them even on
        // a mid-read failure.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, charset))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: callers treat "" as "page unavailable".
        return "";
    }
}
221 }
222 }
这段代码没什么技术含量,只是自己采集着玩,我用它采集了 3G 的图片;程序不太稳定,也懒得优化了。里面用到了两个比较不错的库:Fizzler 和 HtmlAgilityPack。国内关于它们的资料也有,HtmlAgilityPack 的资料国外更多,都很简单。其实采集到之后还可以先插入临时数据库,再导入到正式数据库里;我比较懒,只要图片。代码比较粗糙,附件暂时没能上传,大家把代码粘贴过去自己调一调就能用了。
相关阅读 更多 +