火车票源信息抓取系统
时间:2011-01-24 来源:G.Anthony
一、系统功能:
1.每隔一定时间从网络抓取一次最新的票源信息;
2.支持根据关键字筛选票源信息;
3.支持抓取时间间隔设置;
4.支持票源网址链接;
二、运行环境:
1. .NET 2.0 框架及以上;
2.IE6.0及以上;
三、实现思路:
1.设置抓取的地址与解析的方式
代码
/// <summary>
/// Builds the built-in list of ticket-listing sites to scrape.
/// </summary>
/// <remarks>
/// Each pattern must expose exactly two capture groups, because DoNetInfoParse
/// reads only Groups[1] and Groups[2]: (link, text) by default, or (text, link)
/// when <c>IsChange</c> is set.
/// </remarks>
/// <returns>A new list containing one <c>Site</c> per supported website.</returns>
public static List<Site> GetDefaultSites()
{
    List<Site> sites = new List<Site>();

    // NOTE(review): Encoding.Default depends on the machine's ANSI code page;
    // this presumably expects a Chinese (GBK) locale - confirm before running elsewhere.
    sites.Add(new Site()
    {
        Name = "火车票网",
        Url = "http://www.huochepiao.com/city/Search.asp?leixing=%D7%AA%C8%C3&chufa=&daoda=",
        // Quotes inside a verbatim string must be doubled. The mce_href attribute that
        // was present here looked like rich-text-editor residue and added a third
        // capture group, which shifted the link text out of Groups[2].
        RegexPattern = @"· <A href=""(.*?)"" target=_blank>(.*?)</a>",
        Encoding = Encoding.Default,
        Keys = new string[] { "卧" }
    });

    sites.Add(new Site()
    {
        Name = "百姓网",
        Url = "http://beijing.baixing.com/huochepiao/?%E5%8F%91%E8%BD%A6%E6%97%A5%E6%9C%9F=&%E8%BD%A6%E6%AC%A1=&%E5%87%BA%E5%8F%91%E5%9F%8E%E5%B8%82=%E5%8C%97%E4%BA%AC&%E5%88%B0%E8%BE%BE%E5%9F%8E%E5%B8%82=&wanted=1",
        RegexPattern = @""" ><a href=""/(.*?)"">(.*?)</a></td>",
        Encoding = Encoding.UTF8,
        Domain = "http://beijing.baixing.com/",
        Keys = new string[] { "卧" }
    });

    sites.Add(new Site()
    {
        Name = "赶集网",
        Url = "http://bj.ganji.com/piao/",
        RegexPattern = @"<dt><a href=""/(.*?)"" target=""_blank"">(.*?)</a></dt>",
        Encoding = Encoding.UTF8,
        Domain = "http://bj.ganji.com/",
        Keys = new string[] { "卧" }
    });

    // This site lists the text before the link, so IsChange tells the parser
    // that Groups[1] is the content and Groups[2] is the URL.
    sites.Add(new Site()
    {
        Name = "酷讯网",
        Url = "http://huoche.kuxun.cn/zhuanrang-beijing-wuhan.html",
        // Quotes doubled and the editor-injected mce_style / mce_href attributes removed
        // so that the pattern compiles and keeps exactly two capture groups.
        RegexPattern = @"<div class=""col_11 left"">(.*?)<br /><div style=""padding:8px 0 0 0px;""><a target='_blank' href=""(.*?)"">",
        Encoding = Encoding.UTF8,
        Domain = "",
        IsChange = "YES"
    });

    return sites;
}
2.抓取网页信息
代码
/// <summary>
/// Downloads the resource at <paramref name="URL"/> and decodes its bytes with
/// <paramref name="CodeTpye"/>.
/// </summary>
/// <param name="URL">Address of the page to download.</param>
/// <param name="CodeTpye">Encoding used to turn the downloaded bytes into text.</param>
/// <returns>
/// The decoded page text, or an empty string when the download or decoding fails.
/// </returns>
public string GetNetString(string URL, Encoding CodeTpye)
{
    try
    {
        // WebClient is IDisposable; release it as soon as the download finishes.
        using (WebClient client = new WebClient())
        {
            byte[] pagedata = client.DownloadData(URL);
            return CodeTpye.GetString(pagedata);
        }
    }
    catch (System.Exception ex)
    {
        // Best-effort by design: callers receive "" on any failure. The error is
        // written to the trace output so that failures are no longer invisible.
        System.Diagnostics.Trace.WriteLine("GetNetString failed for " + URL + ": " + ex.Message);
        return "";
    }
}
3.解析票源信息
代码
public class clsNetInfoParseServer
{
// Results accepted by earlier DoNetInfoParse calls, used by IsHas to skip URLs
// that were already reported. Static, so it is shared by all instances of this class.
private static IList<GetResult> lsList = new List<GetResult>();
/// <summary>
/// Forgets every previously recorded result by swapping in a brand-new empty list.
/// </summary>
public void ClearLS()
{
    IList<GetResult> emptyHistory = new List<GetResult>();
    lsList = emptyHistory;
}
/// <summary>
/// Tells whether a result with the given URL has already been recorded.
/// </summary>
/// <param name="URL">URL to look for among the recorded results.</param>
/// <returns><c>true</c> when some recorded result has this exact URL; otherwise <c>false</c>.</returns>
private bool IsHas(string URL)
{
    for (int index = 0; index < lsList.Count; index++)
    {
        GetResult recorded = lsList[index];
        if (recorded.Url == URL)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Extracts listings from downloaded page text with the site's regular expression,
/// skipping URLs that were already recorded and listings that match none of the keywords.
/// </summary>
/// <param name="strNetInfo">Page text to scan.</param>
/// <param name="site">
/// Site description; its RegexPattern, Domain, IsChange and Name are used.
/// </param>
/// <param name="Keys">
/// Keywords to filter by: a listing is kept when its content contains at least one of them.
/// Passing null or an empty array disables filtering.
/// </param>
/// <returns>The listings newly accepted by this call; empty when there are none.</returns>
public IList<GetResult> DoNetInfoParse(string strNetInfo, Site site, string[] Keys)
{
    IList<GetResult> list = new List<GetResult>();

    // Every Match in a MatchCollection is successful, so no Success check is needed.
    foreach (Match m in Regex.Matches(strNetInfo, site.RegexPattern))
    {
        string firstGroup = m.Groups[1].Value.Trim();
        string secondGroup = m.Groups[2].Value.Trim();

        GetResult r = new GetResult();
        if (!string.IsNullOrEmpty(site.IsChange))
        {
            // Swapped layout: text comes first, link second. The domain prefix belongs
            // to the link; it used to be prepended to the text by mistake.
            r.Content = firstGroup;
            r.Url = site.Domain + secondGroup;
        }
        else
        {
            r.Url = site.Domain + firstGroup;
            r.Content = secondGroup;
        }

        // Only accepted listings are recorded, so a listing rejected by the keyword
        // filter is evaluated again on the next call.
        if (IsHas(r.Url) || !ContainsAnyKey(r.Content, Keys))
        {
            continue;
        }

        r.GetDateTime = DateTime.Now.ToString();
        r.Name = site.Name;
        lsList.Add(r);
        list.Add(r);
    }

    return list;
}

// Returns true when no keywords are given, or when the content contains at least one of them.
private static bool ContainsAnyKey(string content, string[] keys)
{
    if (keys == null || keys.Length == 0)
    {
        return true;
    }

    foreach (string key in keys)
    {
        if (content.Contains(key))
        {
            return true;
        }
    }

    return false;
}
}
第一次发帖,排版不规范的还请各位谅解,以后会经常发帖与大家讨论。
转帖请注明出处!
相关阅读 更多 +