网页登录、网页采集基础类
时间:2010-09-08 来源:qianlifeng
昨天在博客园上看到了关于秒杀的文章,通过模拟网页数据传送的方式达到更快的网页访问操作!有些时候模拟网页访问还是蛮有用的,比如自动登录、网页采集等等。下面这个类里的几个方法就是我用到的,虽然方法比较少,但基本的要求还是能满足的。
代码public static class HtmlHelper
{
/// <summary>
/// Requests <paramref name="uri"/> with browser-like headers and returns the
/// raw response body stream.
/// </summary>
/// <param name="uri">Address to request.</param>
/// <param name="cc">Cookie container to attach to the request; may be null, in which case none is attached.</param>
/// <returns>
/// The response body stream, or null when any exception is thrown (the error
/// text is shown in a message box). Neither the response nor the stream is
/// disposed here.
/// </returns>
public static Stream GetBaseStream(string uri, CookieContainer cc)
{
    Stream body = null;
    try
    {
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(uri);

        // Present the request with the headers a desktop browser would send.
        webRequest.ContentType = "application/x-www-form-urlencoded";
        webRequest.Accept = @"application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
        webRequest.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2 ChromePlus/1.3.9.0";

        // Only attach a cookie container when the caller supplied one.
        if (cc != null)
        {
            webRequest.CookieContainer = cc;
        }

        // Send the request and hand back the stream that carries the response body.
        HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse();
        body = webResponse.GetResponseStream();
    }
    catch (Exception ex)
    {
        MessageBox.Show(@"操作失败:" + ex.Message);
        body = null;
    }
    return body;
}
/// <summary>
/// Requests <paramref name="uri"/> with browser-like headers and returns the
/// response body decoded as text.
/// </summary>
/// <param name="uri">Address to request.</param>
/// <param name="postDate">
/// Form data to send as the request body. When non-null the request is sent
/// as a POST and the data is written as UTF-8 bytes; when null no body is
/// written and the request method is left at its default.
/// </param>
/// <param name="cc">Cookie container assigned to the request; may be null.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <returns>
/// The response body as a string, or null when any exception is thrown (the
/// error text is shown in a message box).
/// </returns>
public static string GetHtmlString(string uri, string postDate, CookieContainer cc, Encoding encoding)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
        request.ContentType = "application/x-www-form-urlencoded";
        request.AllowAutoRedirect = true;
        request.Accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
        request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2 ChromePlus/1.3.9.0";
        request.CookieContainer = cc; // cookies received with the response are stored in this container

        if (postDate != null)
        {
            // HTTP method tokens are case-sensitive; use the canonical upper-case form.
            request.Method = "POST";
            byte[] payload = Encoding.UTF8.GetBytes(postDate);
            request.ContentLength = payload.Length;
            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(payload, 0, payload.Length);
            }
        }

        // Dispose the response, its stream and the reader as soon as the body has been read.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, encoding))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(@"发生错误:" + ex.Message);
        return null;
    }
}
/// <summary>
/// Collects every piece of text in <paramref name="html"/> that lies between
/// a match of <paramref name="start"/> and the nearest following match of
/// <paramref name="end"/>. Useful for pulling specific fragments out of a page.
/// </summary>
/// <param name="start">Opening delimiter (e.g. an HTML tag). It is inserted into the regular expression as-is, so regex metacharacters in it are interpreted as pattern syntax.</param>
/// <param name="end">Closing delimiter; inserted into the regular expression as-is, like <paramref name="start"/>.</param>
/// <param name="html">Text to search; null or empty yields an empty list.</param>
/// <returns>
/// The captured fragments in document order (delimiters excluded). The list
/// is empty when nothing matches or when the delimiters do not form a valid
/// regular expression; it is never null.
/// </returns>
public static List<string> GetStrings(string start, string end, string html)
{
    List<string> list = new List<string>();

    // Nothing to search: the capture needs at least one character, so there can be no match.
    if (string.IsNullOrEmpty(html))
    {
        return list;
    }

    try
    {
        // Group "g" lazily captures one or more characters of any kind: "." covers
        // everything except a line feed, and the [\r\n] alternative covers line breaks,
        // so a fragment may span several lines.
        string pattern = string.Format("{0}(?<g>(.|[\r\n])+?){1}", start, end);
        foreach (Match match in Regex.Matches(html, pattern))
        {
            list.Add(match.Groups["g"].Value);
        }
    }
    catch (ArgumentException)
    {
        // The delimiters produced an invalid pattern; stay best-effort and
        // return what we have (nothing) instead of failing the caller.
    }
    return list;
}
}
相关阅读 更多 +