文章详情

  • 游戏榜单
  • 软件榜单
关闭导航
热搜榜
热门下载
热门标签
php爱好者> php文档>火车票源信息抓取系统

火车票源信息抓取系统

时间:2011-01-24  来源:G.Anthony

  一、系统功能:

        1.每隔一定时间从网络抓取一次最新的票源信息;

        2.支持根据关键字筛选票源信息;

        3.支持抓取时间间隔设置;

        4.支持票源网址链接;

  二、运行环境:

        1. .NET 2.0 框架及以上;

        2.IE6.0及以上;

  三、实现思路:

        1.设置抓取的地址与解析的方式

 

代码
/// <summary>
/// Builds the built-in list of second-hand train-ticket listing sites to scrape.
/// </summary>
/// <remarks>
/// Every <c>RegexPattern</c> below exposes exactly two capture groups, because
/// <c>DoNetInfoParse</c> only reads groups 1 and 2. For most sites group 1 is the link
/// target and group 2 is the link text; a site whose page lists the text before the link
/// sets <c>IsChange</c> so the parser swaps the two.
/// </remarks>
/// <returns>A freshly created list holding the four default site definitions.</returns>
public static List<Site> GetDefaultSites()
{
    List<Site> sites = new List<Site>();

    sites.Add(new Site()
    {
        Name = "火车票网",
        Url = "http://www.huochepiao.com/city/Search.asp?leixing=%D7%AA%C8%C3&chufa=&daoda=",
        // Group 1 = href, group 2 = link text. Quotes are doubled because this is a verbatim string.
        RegexPattern = @"· <A href=""(.*?)"" target=_blank>(.*?)</a>",
        // NOTE(review): Encoding.Default is the machine's ANSI code page; the query string above
        // looks GBK-encoded, so this presumably only decodes correctly on a Chinese-locale OS - confirm.
        Encoding = Encoding.Default,
        // A single empty keyword matches every entry, i.e. no filtering by default.
        Keys = new string[] { "" }
    });

    sites.Add(new Site()
    {
        Name = "百姓网",
        Url = "http://beijing.baixing.com/huochepiao/?%E5%8F%91%E8%BD%A6%E6%97%A5%E6%9C%9F=&%E8%BD%A6%E6%AC%A1=&%E5%87%BA%E5%8F%91%E5%9F%8E%E5%B8%82=%E5%8C%97%E4%BA%AC&%E5%88%B0%E8%BE%BE%E5%9F%8E%E5%B8%82=&wanted=1",
        // Group 1 = site-relative href (leading slash excluded), group 2 = link text.
        RegexPattern = @""" ><a href=""/(.*?)"">(.*?)</a></td>",
        Encoding = Encoding.UTF8,
        // Prepended to the relative href captured by group 1.
        Domain = "http://beijing.baixing.com/",
        Keys = new string[] { "" }
    });

    sites.Add(new Site()
    {
        Name = "赶集网",
        Url = "http://bj.ganji.com/piao/",
        // Group 1 = site-relative href (leading slash excluded), group 2 = link text.
        RegexPattern = @"<dt><a href=""/(.*?)"" target=""_blank"">(.*?)</a></dt>",
        Encoding = Encoding.UTF8,
        Domain = "http://bj.ganji.com/",
        Keys = new string[] { "" }
    });

    sites.Add(new Site()
    {
        Name = "酷讯网",
        Url = "http://huoche.kuxun.cn/zhuanrang-beijing-wuhan.html",
        // Here the text comes first: group 1 = entry text, group 2 = href.
        RegexPattern = @"<div class=""col_11 left"">(.*?)<br /><div style=""padding:8px 0 0 0px;""><a target='_blank' href=""(.*?)"">",
        Encoding = Encoding.UTF8,
        Domain = "",
        // Any non-empty value tells DoNetInfoParse that groups 1 and 2 are swapped for this site.
        IsChange = "YES"
    });

    return sites;
}

 

     2.抓取网页信息

 

代码
/// <summary>
/// Downloads the resource at <paramref name="URL"/> and decodes the response bytes
/// with <paramref name="CodeTpye"/>.
/// </summary>
/// <param name="URL">Address of the page to download.</param>
/// <param name="CodeTpye">Encoding used to turn the downloaded bytes into text.</param>
/// <returns>
/// The decoded page text, or an empty string when an argument is missing or the
/// download/decoding fails. This method never throws; callers treat "" as "nothing fetched".
/// </returns>
public string GetNetString(string URL, Encoding CodeTpye)
{
    // Nothing to fetch or no way to decode: same result the failure path produces.
    if (string.IsNullOrEmpty(URL) || CodeTpye == null)
    {
        return "";
    }

    try
    {
        // WebClient is IDisposable; release it as soon as the download finishes.
        using (WebClient client = new WebClient())
        {
            byte[] pagedata = client.DownloadData(URL);
            return CodeTpye.GetString(pagedata);
        }
    }
    catch (System.Exception ex)
    {
        // Deliberately best-effort: one unreachable site must not abort the scrape.
        // The failure is traced instead of being dropped silently.
        System.Diagnostics.Trace.TraceWarning("GetNetString failed for {0}: {1}", URL, ex.Message);
        return "";
    }
}

 

 

     3.解析票源信息 

代码
/// <summary>
/// Extracts ticket entries from downloaded page text using a site's regular expression,
/// filters them by keyword, and suppresses entries whose URL has already been returned.
/// </summary>
public class clsNetInfoParseServer
{
    // Every result handed out since the last ClearLS(). The field is static, so the
    // "already seen" history is shared by all instances of this class.
    private static IList<GetResult> lsList = new List<GetResult>();

    /// <summary>
    /// Forgets all previously returned results, so the next parse reports every entry again.
    /// </summary>
    public void ClearLS()
    {
        lsList = new List<GetResult>();
    }

    /// <summary>
    /// Tells whether a result with the given URL has already been returned.
    /// </summary>
    /// <param name="URL">URL to look for (exact string comparison).</param>
    /// <returns><c>true</c> when the URL is in the history; otherwise <c>false</c>.</returns>
    private bool IsHas(string URL)
    {
        // Linear scan over the history list.
        foreach (GetResult item in lsList)
        {
            if (item.Url == URL)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Tells whether <paramref name="content"/> passes the keyword filter.
    /// </summary>
    /// <param name="content">Entry text to test.</param>
    /// <param name="keys">Keywords; null or empty means "no filter".</param>
    /// <returns><c>true</c> when there is no filter or at least one keyword occurs in the text.</returns>
    private static bool ContainsAnyKey(string content, string[] keys)
    {
        if (keys == null || keys.Length == 0)
        {
            return true;
        }

        foreach (string key in keys)
        {
            // A null keyword is skipped (string.Contains(null) would throw).
            // An empty keyword matches everything, which the default sites rely on.
            if (key != null && content.Contains(key))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Parses page text into results that are new (URL not seen before) and match the keywords.
    /// </summary>
    /// <param name="strNetInfo">Downloaded page text; null or empty yields an empty list.</param>
    /// <param name="site">
    /// Site definition supplying <c>RegexPattern</c>, <c>Domain</c>, <c>Name</c> and <c>IsChange</c>.
    /// The pattern must expose the link target and the entry text as capture groups 1 and 2;
    /// a non-empty <c>IsChange</c> means the text is group 1 and the link target is group 2.
    /// </param>
    /// <param name="Keys">Keywords to filter by; null or empty disables filtering.</param>
    /// <returns>The newly found results, which are also added to the shared history.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="site"/> is null.</exception>
    public IList<GetResult> DoNetInfoParse(string strNetInfo, Site site, string[] Keys)
    {
        if (site == null)
        {
            throw new ArgumentNullException("site");
        }

        IList<GetResult> list = new List<GetResult>();

        // A failed download arrives here as "" (or null); there is nothing to match.
        if (string.IsNullOrEmpty(strNetInfo))
        {
            return list;
        }

        bool groupsSwapped = !string.IsNullOrEmpty(site.IsChange);
        int urlGroup = groupsSwapped ? 2 : 1;
        int contentGroup = groupsSwapped ? 1 : 2;

        // Every Match yielded by a MatchCollection is successful, so no Success check is needed.
        foreach (Match m in Regex.Matches(strNetInfo, site.RegexPattern))
        {
            // Domain is the prefix for the captured href, so it always belongs on the URL,
            // whichever group the URL was captured in.
            string url = site.Domain + m.Groups[urlGroup].Value.Trim();
            string content = m.Groups[contentGroup].Value.Trim();

            if (IsHas(url) || !ContainsAnyKey(content, Keys))
            {
                continue;
            }

            GetResult r = new GetResult();
            r.Url = url;
            r.Content = content;
            r.GetDateTime = DateTime.Now.ToString();
            r.Name = site.Name;

            lsList.Add(r);
            list.Add(r);
        }

        return list;
    }
}

 

 

 

 

 

     第一次发帖,排版不规范的还请各位谅解,以后会经常发帖与大家讨论。

 

     转帖请注明出处!

相关阅读 更多 +
排行榜 更多 +
辰域智控app

辰域智控app

系统工具 下载
网医联盟app

网医联盟app

运动健身 下载
汇丰汇选App

汇丰汇选App

金融理财 下载