下载CU中Linux某板块的所有文章

时间：2008-10-28 来源：newred

下载CU中Linux某板块的所有文章

#!c:\perl\bin\perl.exe -w
use strict;
use LWP::Simple;

   # Download every article linked from the index pages of one board.
   #
   # Step 1: build the list of index pages.  The page count was read off the
   # web site by hand; nothing here detects it automatically.  The first page
   # is index.shtml, the remaining ones are index_1.shtml .. index_242.shtml
   # (243 index pages in total).
   # NOTE(review): the original note also quoted the pager's "last page" link
   # as <a href='index_232.shtml'>, which disagrees with 242 -- confirm the
   # real last page before a new run.
   my $url_head           = 'http://linux.chinaunix.net/***/***/';
   my $last_numbered_page = 242;

   my @index_url = ($url_head . 'index.shtml');
   push @index_url, $url_head . 'index_' . $_ . '.shtml'
       for 1 .. $last_numbered_page;

   # Step 2: the pattern that picks article links out of an index page.
   # Flags: s = '.' also matches newlines, i = case-insensitive,
   #        x = whitespace and comments inside the pattern are ignored.
   my $article_re = qr{
       ( /\d{4}/\d{2}/\d{2}/\d*\.shtml )  # capture 1: article path, four-digit
                                          #   year, two-digit month and day, then
                                          #   zero or more digits and ".shtml",
                                          #   e.g. /2008/07/10/1016009.shtml
       .*?                                # lazily skip ahead to the title markup
       14px">                             # end of the link's style attribute
       ( .*? ) <                          # capture 2: title text; the lazy ".*?"
                                          #   stops at the FIRST "<" (a greedy
                                          #   ".*" would run to the last one)
   }xsi;

   # Characters that must never reach the file system from a scraped title:
   # path separators, Windows-reserved punctuation and control characters.
   my $unsafe_name_re = qr{[\\/:*?"<>|\x00-\x1f]};

   my $fallback_no = 1;    # next number for articles stored under a numeric name
   my %name_taken;         # lower-cased file names already written in this run

   # Step 3: fetch each index page and store every article it links to.
   # Files are written to the current working directory.
   for my $index_url (@index_url) {
       my $webdoc = get($index_url);
       print "正在处理 $index_url", "\n";

       # get() yields undef when the page cannot be fetched; skip that page
       # instead of matching against an undefined value.
       if (!defined $webdoc) {
           warn "could not fetch $index_url, skipping\n";
           next;
       }

       my $j = 1;    # running article number within this index page
       while ($webdoc =~ /$article_re/g) {
           my ($path, $title) = ($1, $2);

           print $j, ":$path---$title:";
           $j++;

           my $url  = $url_head . $path;
           my $file = $title . '.html';

           # Only use the title as a file name when it is non-empty, free of
           # unsafe characters and not already used by an earlier article
           # (identical titles used to overwrite each other silently).
           my $title_usable = $title ne ''
               && $title !~ $unsafe_name_re
               && !$name_taken{lc $file};

           my $code = $title_usable ? getstore($url, $file) : undef;

           if ($title_usable && !is_error($code)) {
               $name_taken{lc $file} = 1;
               print "--succeed 1--\n";
               next;
           }

           # Fallback: store under the next free increasing number.
           # NOTE(review): whether getstore() reports an unwritable file name
           # through its status code may depend on the installed LWP version;
           # the up-front title check above does not rely on it.
           do { $file = $fallback_no++ . '.html' } while $name_taken{lc $file};

           $code = getstore($url, $file);
           if (is_error($code)) {
               print "--failed ($code)--\n";
               next;
           }
           $name_taken{lc $file} = 1;
           print "--succeed 3--\n";
       }
   }
   exit;

   # Notes carried over from the original write-up:
   # - $webdoc holds the raw content of one index page; the pattern above is
   #   what analyses it.  A matching link tail looks like
   #       14px">(article title)</a>
   # - One full run fetched 243 index pages, a little over 7000 articles, and
   #   took more than an hour.  Because several articles share a title, that
   #   run ended with fewer than 7000 files on disk; duplicate titles now go
   #   to numbered files instead of overwriting each other.

相关阅读更多 +