本文介绍了无法在webclient中完全提取数据的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我试图从该网站提取数据,首先我试图获取类别,并从每个类别我得到子类别,然后从每个子类别我试图阅读和提取一些文本。



我正在使用 WebClient,但遇到了一些奇怪的问题:有时读取不到数据,有时又能正常读取。



有时我能获取到 30 个类别,但只有其中 10 个类别能取到子类别;有时只获取到 10 个类别,却能用 WebClient 读取全部子类别。



如何解决这个问题?

以下是代码:

Hi, I'm trying to extract data from that website. First I try to get the categories; from each category I get its subcategories, and then from each subcategory I try to read and extract some text.

I'm using WebClient and I'm facing a strange problem: sometimes it doesn't read the data, and sometimes it does.

Sometimes I get 30 categories but only get subcategories for 10 of them; other times I get 10 categories and read all of their subcategories with the web client.

How can I solve this problem?
Following is the code:

public Extract(string url)
       {
           // Download the page at `url` and feed it to ExtractLines one
           // cleaned line at a time.  All parsing state (client, strm,
           // strrdr, categorylines, subcategorylines, ...) lives in fields
           // declared elsewhere in this class.
           client = new WebClient();
           strm = client.OpenRead(url);
           // NOTE(review): ASCII will mangle any non-ASCII characters in the
           // page; the site is probably UTF-8 — confirm before changing.
           strrdr = new StreamReader(strm, Encoding.ASCII);
           categorylines = new List<string>();
           // Fixed capacity: parsing is limited to 30 categories (see the
           // bounds guard in ExtractLines, which indexes into this array).
           subcategorylines = new List<string>[30];

           try
           {
               // BUG FIX: the original looped on `Peek() > 0`, which stops
               // early when the next character is NUL (char code 0) instead
               // of only at end-of-stream (-1) — a likely cause of the
               // "sometimes I read data, sometimes I don't" symptom.
               // ReadLine() returns null exactly at end-of-stream.
               string line;
               while ((line = strrdr.ReadLine()) != null)
               {
                   // ReadLine already strips the line terminator; also drop
                   // embedded tabs and backslashes so the regexes in
                   // ExtractLines see one clean line.
                   line = line.Replace("\n", string.Empty)
                              .Replace("\t", string.Empty)
                              .Replace("\r", string.Empty)
                              .Replace("\\", string.Empty);
                   ExtractLines(line);
               }
           }
           finally
           {
               // Close the reader and the underlying response stream even if
               // parsing throws — the original leaked both on error.
               strrdr.Close();
               strm.Close();
           }
       }

       public void ExtractSubcategories() {
           // Turn each raw `<a href="...">name</a>` line collected for
           // category i (in subcategorylines[i]) into an ArticleSubCategory
           // attached to Categories.Category[i].
           //
           // Compile the two regexes once — the original rebuilt both inside
           // the inner loop, once per HTML line.
           find1 = new Regex(@"href="".+"">", RegexOptions.IgnoreCase);
           find2 = new Regex(@">.+<\/a>", RegexOptions.IgnoreCase);

           for (int i = 0; i < Categories.Category.Count; i++)
           {
               foreach (var item in subcategorylines[i])
               {
                   // BUG FIX: reset per line.  The original declared url/name
                   // outside both loops, so when a match failed the previous
                   // line's values were silently reused for the new
                   // sub-category.
                   string url = null;
                   string name = null;

                   m1 = find1.Match(item);
                   m2 = find2.Match(item);
                   if (m1.Success)
                   {
                       // Strip the 6-char `href="` prefix and 2-char `">`
                       // suffix from the match, then prepend the host.
                       url = "www.codeproject.com"
                           + item.Substring(m1.Index + 6, m1.Length - 8);
                   }
                   if (m2.Success)
                   {
                       // Strip the leading `>` and the trailing `</a>`.
                       name = item.Substring(m2.Index + 1, m2.Length - 5);
                   }
                   Categories.Category[i].SubCategories.Add(
                       new ArticleSubCategory(name, url));
               }
           }
       }
       public void ExtractCategory() {
           // Convert every raw category anchor line into an ArticleCategory
           // (name, url) and register it on Categories.
           tblcategories = new DataTable();
           foreach (var item in categorylines)
           {
               // href="..."> : drop the 6-char `href="` prefix and the
               // 2-char `">` suffix of the match.
               var url = GetLine(@"href="".+"">", 6, 8, item);
               // >name</a> : drop the leading `>` and the 4-char `</a>`
               // suffix of the match.
               var name = GetLine(@">.+<\/a>", 1, 5, item);
               Categories.Add(new ArticleCategory(name, url));
           }
       }

       public string GetLine(string regex, int start, int end, string line)
       {
           // Returns the first match of `regex` (case-insensitive) in
           // `line`, trimmed: the result starts `start` characters into the
           // match and has length (match length - end), i.e. the last
           // (end - start) characters of the match are dropped.
           // Returns null when there is no match, or when the trim
           // parameters do not fit the match (the original threw
           // ArgumentOutOfRangeException in that case).
           //
           // Uses locals instead of the shared `find1`/`m1` fields: the
           // original clobbered those fields on every call — an invisible
           // coupling with the other methods that also assign them.
           var re = new Regex(regex, RegexOptions.IgnoreCase);
           var m = re.Match(line);
           if (!m.Success)
           {
               return null;
           }

           int pos = m.Index + start;
           int len = m.Length - end;
           if (len < 0 || pos < 0 || pos + len > line.Length)
           {
               // Match too short for the requested trim — treat as no match.
               return null;
           }
           return line.Substring(pos, len);
       }

       //public void ExtractArticle() { }

       public void ExtractLines(string line)
       {
           // Route one cleaned HTML line into the right bucket: a line
           // matching categoryRegex is appended to categorylines and opens a
           // new sub-category bucket; a line matching subcategoryRegex is
           // appended to the bucket of the most recently seen category.
           string a;

           if ((a = GetLine(categoryRegex, 0, 0, line)) != null)
           {
               categorylines.Add(a);
               int index = categorylines.Count - 1;
               // Bounds guard: subcategorylines has a fixed 30 slots (see
               // the constructor); the original threw past slot 29.
               if (index < subcategorylines.Length)
               {
                   subcategorylines[index] = new List<string>();
               }
               // Keep `tmp` equal to the number of categories seen so far,
               // matching the net effect of the original `tmp++`
               // (assuming tmp starts at 0 — TODO confirm field initializer).
               tmp = categorylines.Count;
           }
           if ((a = GetLine(subcategoryRegex, 0, 0, line)) != null)
           {
               // BUG FIX: the original indexed with `tmp` AFTER the
               // pre-increment above, so category i's sub-categories landed
               // in subcategorylines[i + 1] while ExtractSubcategories reads
               // subcategorylines[i] — the likely cause of "30 categories
               // but sub-categories for only 10".  Index by the category's
               // own position instead, and ignore sub-category lines seen
               // before any category (the original would throw or mis-file
               // them).
               int index = categorylines.Count - 1;
               if (index >= 0 && index < subcategorylines.Length)
               {
                   subcategorylines[index].Add(a);
               }
           }
       }

       public void ExtractArticleMeta() {
           // For every category, fetch each of its sub-category pages and
           // collect the lines matching articleregex into articleLines[tmp]
           // (one bucket per category).
           tmp = 0;
           string a;
           foreach (var item in Categories.Category)
           {
               articleLines[tmp] = new List<string>();
               foreach (var subcat in item.SubCategories)
               {
                   client = new WebClient();
                   strm = client.OpenRead("http://" + subcat.Url);
                   // NOTE(review): ASCII will mangle non-ASCII pages — the
                   // site is probably UTF-8; confirm before changing.
                   strrdr = new StreamReader(strm, Encoding.ASCII);
                   try
                   {
                       // BUG FIX: loop until ReadLine() returns null
                       // (end-of-stream); the original `Peek() > 0` test
                       // stopped early if the next character was NUL.
                       string line;
                       while ((line = strrdr.ReadLine()) != null)
                       {
                           line = line.Replace("\n", string.Empty)
                                      .Replace("\t", string.Empty)
                                      .Replace("\r", string.Empty)
                                      .Replace("\\", string.Empty);
                           if ((a = GetLine(articleregex, 0, 0, line)) != null)
                           {
                               articleLines[tmp].Add(a);
                           }
                       }
                   }
                   finally
                   {
                       // BUG FIX: the original never closed these — one
                       // leaked HTTP response stream per sub-category page.
                       // Leaked connections can exhaust the per-host
                       // connection limit and make later requests stall,
                       // a plausible cause of the flaky "sometimes no data"
                       // behaviour described in the question.
                       strrdr.Close();
                       strm.Close();
                   }
               }
               tmp = tmp + 1;
           }
       }

推荐答案


这篇关于无法在webclient中完全提取数据的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!

10-29 06:10