大神们帮我看下为什么这样子我采集不到文章呢?一远行就报错-C#论坛-求助专区-C#论坛-C#教程-IBC编程社区

asd7298183 发表于 2014-5-2 13:56:38

大神们帮我看下为什么这样子我采集不到文章呢?一远行就报错

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;

namespace code20
{
class Program
{
   static void Main(string[] args)
   {

         Console.Write("1:抓取，2:处理:");
         if (Console.ReadLine() == "1")
         {

            string url = "http://www.admin5.com/article/20121228/481070.shtml";
            string html=gethtml(url, Encoding.Default);

            //Regex r = new Regex("(?<=<title>).*?(?=</title>)");//实例化一个正则
            //MatchCollection co=r.Matches(html);//匹配所有项返回一个集合
            //Console.WriteLine("标题：" + co.Value);//通过索引加value获取到内容

            //Regex rcontent = new Regex("<div class=\"content\">[\\s\\S]*?</div>");
            //MatchCollection cocontent = rcontent.Matches(html);
            //Console.WriteLine("内容："+cocontent.Value);

            string listurl = "http://www.admin5.com/browse/177/";

            string listhtml = gethtml(listurl, Encoding.Default);
            //http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?
            Regex rlist = new Regex("(?<=href=\").*?(?=\")");

            MatchCollection co = rlist.Matches(listhtml);

            for (int i = 0; i < co.Count; i++)
            {

               if (co.Value.ToString().Contains("article"))
               {

                     Console.WriteLine("http://www.admin5.com/" + co.Value);
                     Console.WriteLine("抓取内容....");
                     string contenthtml = gethtml("http://www.admin5.com/" + co.Value, Encoding.Default);

                     Regex r = new Regex("(?<=<title>).*?(?=</title>)");//实例化一个正则
                     MatchCollection cotitlt = r.Matches(contenthtml);//匹配所有项返回一个集合

                     // Console.WriteLine("标题：" + cotitlt.Value);//通过索引加value获取到内容

                     Regex rcontent = new Regex("<div class=\"content\">[\\s\\S]*?</div>");
                     MatchCollection cocontent = rcontent.Matches(contenthtml);
                     // Console.WriteLine("内容：" + cocontent.Value);
                     string title = cotitlt.Value;
                     string content = cocontent.Value;
                     Console.WriteLine("保存数据...");
                     string appdir = Directory.GetCurrentDirectory();
                     if (!Directory.Exists(appdir + "\\data"))
                     {
                        Directory.CreateDirectory(appdir + "\\data");

                     }

                     File.WriteAllText(appdir + "\\data" + "\\" + i + ".txt", title + "\r\n" + content);
                     Console.WriteLine("保存成功!");

               }

            }

            Console.ReadLine();

         }
         else
         {

            string appdir = Directory.GetCurrentDirectory();

            string [] files= Directory.GetFiles(appdir+"\\data");//获取data里面所有的文件

            foreach(string filename in files)//遍历所有文件名
            {
               Console.WriteLine(filename);

               string html = File.ReadAllText(filename,Encoding.UTF8);//读取内容
               string title = html.Remove(html.IndexOf('\n')); //提取标题
               string content = html.Replace(title, "");//替换掉内容中的标题，提取出内容
                  title = title.Remove(title.LastIndexOf('-'));//处理标题
               Console.Write(title);

               Regex r = new Regex("(?<=href=\").*?(?=\")");

               MatchCollection con= r.Matches(content);
               for (int i = 0; i < con.Count;i++ )
               {

                  string url = con.Value;
                  string newurl =url.Replace( "www.admin5.com","www.shouyu.com");
                  content= content.Replace(url,newurl);

               }
      content = content.Replace("<div class=\"content\">","").Replace("</div>","");

               Console.WriteLine("保存...");
               File.Delete(filename);
               File.WriteAllText(filename, title + "\r\n" + content, Encoding.UTF8);

               Console.ReadLine();
            }

            Console.ReadLine();

         }

   }
   /// <summary>
   /// 根据url和编码获取html内容
   /// </summary>
   /// <param name="url">完整链接带http</param>
   /// <param name="enc">编码</param>
   /// <returns>字符串，html代码</returns>
   public static string gethtml(string url,Encoding enc)
   {

         WebClient myweb = new WebClient();//实例化一个WebClient连接

         Stream stream = myweb.OpenRead(url);//根据指定的url获取流

         StreamReader sr = new StreamReader(stream,enc);//从流中用utf8编码实例化一个读取器

         string html = sr.ReadToEnd();//从流中读取数据得到字符串

         return html;

   }
}
}

ibcadmin 发表于 2014-5-2 19:09:14

报什么错贴上来

页: [1]

IBC编程社区-C#论坛-C#教程,.NET教程-.NET源码's Archiver

大神们帮我看下为什么这样子我采集不到文章呢?一远行就报错