在线解析html,获得需要的url
时间:2010-10-22 来源:slddyb
/**
 * Walks the cc98 forum list pages ({@code list.asp?boardid=..&page=..}), and for every page
 * appends the HTML of each {@code <a>} tag that has an {@code id} attribute to a local text file.
 *
 * <p>Paging through a board stops as soon as {@link #compare_html(String, String)} reports that
 * the page just fetched looks the same as the previously fetched one.
 */
public class last {

    /**
     * URL of the page that the next fetched page is compared against. Replaced by
     * {@link #newurl} whenever the comparison reports a difference.
     */
    public String oldurl = "http://www.cc98.org/list.asp?index.asp";

    /** URL most recently passed to {@link #write_file_url(String)}. */
    public String newurl = "http://www.cc98.org/list.asp?index.asp";

    /**
     * Iterates board ids 3..623 and, per board, pages 2..466, handing each list URL to
     * {@link #write_file_url(String)}. A board is abandoned as soon as that call returns
     * {@code true}. Prints every URL that did not end its board, then {@code "end"}.
     *
     * @param args unused
     * @throws IOException     if the output file cannot be written
     * @throws ParserException if a page cannot be fetched or parsed
     */
    public static void main(String[] args) throws IOException, ParserException {
        last urls = new last();
        // NOTE(review): the index arithmetic (i + 1, j + 1) makes the crawl start at
        // board 3 and page 2 - confirm that skipping boards 1-2 and page 1 is intended.
        for (int i = 2; i < 623; i++) {
            int boardid = i + 1;
            for (int j = 1; j < 466; j++) {
                int page = j + 1;
                String url = "http://www.cc98.org/list.asp?boardid=" + boardid + "&page=" + page;
                if (urls.write_file_url(url)) {
                    break;
                }
                System.out.println(url);
            }
        }
        System.out.println("end");
    }

    /**
     * Fetches {@code url}, prints and appends to {@code F://htmls/compare/1.txt} the HTML of every
     * {@code <a>} tag carrying an {@code id} attribute, then compares the page with the one
     * remembered in {@link #oldurl}.
     *
     * @param url list page to fetch
     * @return {@code true} if {@link #compare_html(String, String)} reports the page as
     *         empty or equal to the previous one (the caller stops paging); {@code false}
     *         otherwise, in which case {@link #oldurl} is advanced to {@code url}
     * @throws IOException     if the output file cannot be opened or written
     * @throws ParserException if the page cannot be fetched or parsed
     */
    public boolean write_file_url(String url) throws IOException, ParserException {
        // The file is opened in append mode before the page is fetched, as before.
        BufferedWriter bw = new BufferedWriter(new FileWriter("F://htmls/compare/1.txt", true));
        try {
            ConnectionManager manager = Page.getConnectionManager();
            Parser parser = new Parser(manager.openConnection(url));
            parser.setEncoding("utf-8");
            NodeFilter filter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id"));
            NodeList nodelist = parser.parse(filter);
            NodeIterator it = nodelist.elements();
            while (it.hasMoreNodes()) {
                // Render the node once and reuse the text for console and file.
                String html = it.nextNode().toHtml();
                System.out.println(html);
                bw.write(html);
                bw.newLine();
                bw.flush();
            }
        } finally {
            // Always release the file handle, even when fetching or parsing fails;
            // closing the BufferedWriter also closes the wrapped FileWriter.
            bw.close();
        }
        // NOTE(review): the links are written before the duplicate check below, so a page
        // that turns out to repeat the previous one has already been appended to the file.
        newurl = url;
        if (last.compare_html(oldurl, newurl)) {
            return true;
        }
        oldurl = newurl;
        return false;
    }

    /**
     * Fetches both URLs and compares the string form of their node lists of {@code <div>}
     * elements that have a {@code <font>} child.
     *
     * @param path  URL of the reference page
     * @param path2 URL of the page being checked
     * @return {@code true} if the node list of {@code path2} renders as an empty string or
     *         renders identically to that of {@code path}; {@code false} otherwise
     * @throws ParserException if either page cannot be fetched or parsed
     */
    public static boolean compare_html(String path, String path2) throws ParserException {
        ConnectionManager manager = Page.getConnectionManager();
        Parser parser = new Parser(manager.openConnection(path));
        Parser parser1 = new Parser(manager.openConnection(path2));
        parser1.setEncoding("utf-8");
        parser.setEncoding("utf-8");
        NodeFilter filter =
                new AndFilter(new TagNameFilter("div"), new HasChildFilter(new TagNameFilter("font")));
        NodeList nodelist = parser.parse(filter);
        NodeFilter filter1 =
                new AndFilter(new TagNameFilter("div"), new HasChildFilter(new TagNameFilter("font")));
        NodeList nodelist1 = parser1.parse(filter1);
        String current = nodelist1.toString();
        if (current.isEmpty()) {
            return true;
        }
        return current.equals(nodelist.toString());
    }
}
通过 htmlparser 解析网页,提取带有 id 属性的 <a> 标签,并将其 HTML 追加保存到本地文本文件中。
/**
 * Entry point: visits list pages for board ids 3 through 623 and, within each board,
 * pages 2 through 466, passing every URL to write_file_url. Moves on to the next board
 * as soon as write_file_url returns true. Each URL that did not end its board is echoed
 * to standard output, and "end" is printed once all boards are done.
 *
 * @param args unused
 * @throws IOException     propagated from write_file_url
 * @throws ParserException propagated from write_file_url
 */
public static void main(String[] args) throws IOException, ParserException
{
    final last crawler = new last();
    for (int board = 3; board <= 623; board++)
    {
        for (int pageNo = 2; pageNo <= 466; pageNo++)
        {
            final String target = "http://www.cc98.org/list.asp?boardid=" + board + "&page=" + pageNo;
            final boolean stopBoard = crawler.write_file_url(target);
            if (stopBoard)
            {
                break;
            }
            System.out.println(target);
        }
    }
    System.out.println("end");
}
// URL that write_file_url passes to compare_html as the reference page; write_file_url
// replaces it with newurl whenever the comparison returns false.
public String oldurl="http://www.cc98.org/list.asp?index.asp";
// URL most recently handed to write_file_url (assigned there just before the comparison).
public String newurl="http://www.cc98.org/list.asp?index.asp";
/**
 * Fetches {@code url}, prints and appends to {@code F://htmls/compare/1.txt} the HTML of every
 * {@code <a>} tag carrying an {@code id} attribute, then compares the page with the one
 * remembered in {@code oldurl}.
 *
 * @param url list page to fetch
 * @return {@code true} if {@code compare_html(oldurl, url)} returns true; {@code false}
 *         otherwise, in which case {@code oldurl} is advanced to {@code url}
 * @throws IOException     if the output file cannot be opened or written
 * @throws ParserException if the page cannot be fetched or parsed
 */
public boolean write_file_url(String url)throws IOException, ParserException
{
    // The file is opened in append mode before the page is fetched, as before.
    BufferedWriter bw = new BufferedWriter(new FileWriter("F://htmls/compare/1.txt", true));
    try
    {
        ConnectionManager manager = Page.getConnectionManager();
        Parser parser = new Parser(manager.openConnection(url));
        parser.setEncoding("utf-8");
        NodeFilter filter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id"));
        NodeList nodelist = parser.parse(filter);
        NodeIterator it = nodelist.elements();
        while (it.hasMoreNodes())
        {
            // Render the node once and reuse the text for console and file.
            String html = it.nextNode().toHtml();
            System.out.println(html);
            bw.write(html);
            bw.newLine();
            bw.flush();
        }
    }
    finally
    {
        // Always release the file handle, even when fetching or parsing fails;
        // closing the BufferedWriter also closes the wrapped FileWriter.
        bw.close();
    }
    newurl = url;
    if (last.compare_html(oldurl, newurl))
    {
        return true;
    }
    oldurl = newurl;
    return false;
}
/**
 * Fetches both URLs and compares the string form of their node lists of {@code <div>}
 * elements that have a {@code <font>} child.
 *
 * @param path  URL of the reference page
 * @param path2 URL of the page being checked
 * @return {@code true} if the node list of {@code path2} renders as an empty string or
 *         renders identically to that of {@code path}; {@code false} otherwise
 * @throws ParserException if either page cannot be fetched or parsed
 */
public static boolean compare_html(String path,String path2) throws ParserException
{
    final ConnectionManager connections = Page.getConnectionManager();
    // Connections are opened in the same order as before: reference page first.
    final Parser referenceParser = new Parser(connections.openConnection(path));
    final Parser candidateParser = new Parser(connections.openConnection(path2));
    candidateParser.setEncoding("utf-8");
    referenceParser.setEncoding("utf-8");

    final NodeList referenceNodes = referenceParser.parse(
            new AndFilter(new TagNameFilter("div"), new HasChildFilter(new TagNameFilter("font"))));
    final NodeList candidateNodes = candidateParser.parse(
            new AndFilter(new TagNameFilter("div"), new HasChildFilter(new TagNameFilter("font"))));

    final String candidateText = candidateNodes.toString();
    return candidateText.isEmpty() || candidateText.equals(referenceNodes.toString());
}
}
通过 htmlparser 解析网页,提取带有 id 属性的 <a> 标签,并将其 HTML 追加保存到本地文本文件中。
相关阅读 更多 +