C# Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素

Posted Util6 优六系统

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了C# Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素相关的知识,希望对你有一定的参考价值。

将 HTML 格式的内容转换为 CSV 内容,支持 table(重点处理 rowspan 和 colspan 合并单元格)、p、div 元素;不支持嵌套的 table。

  /// <summary>
  /// Converts HTML markup to CSV-style text. Each table is written as a "#flag_table#," marker line
  /// followed by one line per row, with rowspan/colspan merges expanded onto a rectangular grid.
  /// The p elements and then the div elements are each written under a "#flag_text#," marker line.
  /// Nested tables are not supported.
  /// </summary>
  /// <param name="hrml">The HTML markup to convert.</param>
  /// <returns>
  /// The generated CSV text; an empty string when the input is null, empty, or contains no
  /// table, p or div elements.
  /// </returns>
  private string HtmlToCsv(string hrml)
  {
      if (string.IsNullOrEmpty(hrml))
      {
          return string.Empty;
      }

      HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
      doc.LoadHtml(hrml);
      StringBuilder sbLines = new StringBuilder();

      HtmlAgilityPack.HtmlNodeCollection tList = doc.DocumentNode.SelectNodes("//table");
      if (tList != null)
      {
          foreach (HtmlAgilityPack.HtmlNode table in tList)
          {
              sbLines.AppendLine("#flag_table#,");

              // ".//tr" is relative to this table; a bare "//tr" is an absolute path and would
              // return the rows of every table in the document.
              HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes(".//tr");
              if (rows != null)
              {
                  AppendHtmlTableAsCsv(rows, sbLines);

                  // Empty the table so the p/div queries below cannot reach nodes inside it.
                  table.RemoveAll();
              }
          }
      }

      // Paragraphs are emptied after being read, so a p nested in a div is not repeated by the div pass.
      AppendHtmlTextNodesAsCsv(doc.DocumentNode.SelectNodes("//p"), true, sbLines);

      // Div nodes are left intact; the text of a nested div is therefore part of every enclosing div too.
      AppendHtmlTextNodesAsCsv(doc.DocumentNode.SelectNodes("//div"), false, sbLines);

      return sbLines.ToString();
  }

  /// <summary>
  /// Lays the cells of one table out on a grid and appends the grid to the output, one line per row.
  /// </summary>
  /// <param name="rows">The tr nodes of the table; must contain at least one row.</param>
  /// <param name="sbLines">The output buffer.</param>
  private static void AppendHtmlTableAsCsv(HtmlAgilityPack.HtmlNodeCollection rows, StringBuilder sbLines)
  {
      // The grid width is the sum of the colspans of the cells in the first row.
      int colCount = 0;
      foreach (HtmlAgilityPack.HtmlNode cell in GetHtmlRowCells(rows[0]))
      {
          colCount += ReadHtmlSpanAttribute(cell, "colspan");
      }

      int rowCount = rows.Count;

      // null marks a free slot; any non-null value marks a slot that is already taken.
      string[][] grid = new string[rowCount][];
      for (int r = 0; r < rowCount; r++)
      {
          grid[r] = new string[colCount];
      }

      for (int r = 0; r < rowCount; r++)
      {
          int col = 0;
          foreach (HtmlAgilityPack.HtmlNode cell in GetHtmlRowCells(rows[r]))
          {
              // Skip slots claimed by a rowspan that started in an earlier row.
              while (col < colCount && grid[r][col] != null)
              {
                  col++;
              }

              if (col >= colCount)
              {
                  // The row holds more cells than the first row declared; the extras are dropped.
                  break;
              }

              int colspan = ReadHtmlSpanAttribute(cell, "colspan");
              int rowspan = ReadHtmlSpanAttribute(cell, "rowspan");
              string text = CleanHtmlToCsvText(cell.InnerText);

              // Clamp the merged region to the grid so oversized spans cannot index out of range.
              int rowEnd = System.Math.Min(r + rowspan, rowCount);
              int colEnd = System.Math.Min(col + colspan, colCount);
              for (int ri = r; ri < rowEnd; ri++)
              {
                  for (int ci = col; ci < colEnd; ci++)
                  {
                      grid[ri][ci] = string.Empty;
                  }
              }

              // The text goes into the top-left slot of the region. An empty merged cell is written
              // as a single space, which keeps the output of earlier versions of this method unchanged.
              bool merged = colspan > 1 || rowspan > 1;
              grid[r][col] = (merged && text.Length == 0) ? " " : text;

              col = colEnd;
          }
      }

      // Every slot is followed by a comma and every row ends with one extra comma.
      foreach (string[] line in grid)
      {
          foreach (string value in line)
          {
              sbLines.Append(value ?? string.Empty).Append(",");
          }
          sbLines.AppendLine(",");
      }
  }

  /// <summary>
  /// Appends a "#flag_text#," marker line followed by one line per node. Does nothing when
  /// <paramref name="nodes"/> is null.
  /// </summary>
  /// <param name="nodes">The nodes whose text is written; may be null.</param>
  /// <param name="clearAfterRead">True to empty each node once its text has been read.</param>
  /// <param name="sbLines">The output buffer.</param>
  private void AppendHtmlTextNodesAsCsv(HtmlAgilityPack.HtmlNodeCollection nodes, bool clearAfterRead, StringBuilder sbLines)
  {
      if (nodes == null)
      {
          return;
      }

      sbLines.AppendLine("#flag_text#,");
      foreach (HtmlAgilityPack.HtmlNode node in nodes)
      {
          string text = GetTextByHtml(CleanHtmlToCsvText(node.InnerText));
          if (!string.IsNullOrWhiteSpace(text))
          {
              sbLines.Append(text + ",");
          }

          // A node without text still produces a line holding a single comma.
          sbLines.AppendLine(",");

          if (clearAfterRead)
          {
              node.RemoveAll();
          }
      }
  }

  /// <summary>
  /// Returns the td and th nodes that are direct children of a table row.
  /// </summary>
  private static List<HtmlAgilityPack.HtmlNode> GetHtmlRowCells(HtmlAgilityPack.HtmlNode tr)
  {
      return tr.ChildNodes
          .Where(m => string.Equals(m.OriginalName, "td", System.StringComparison.OrdinalIgnoreCase)
                   || string.Equals(m.OriginalName, "th", System.StringComparison.OrdinalIgnoreCase))
          .ToList();
  }

  /// <summary>
  /// Reads a colspan/rowspan attribute. Returns 1 when the attribute is missing, is not an
  /// integer, or is smaller than 1.
  /// </summary>
  private static int ReadHtmlSpanAttribute(HtmlAgilityPack.HtmlNode cell, string name)
  {
      HtmlAgilityPack.HtmlAttribute attr = cell.Attributes[name];
      int span;
      if (attr == null
          || !int.TryParse(attr.Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out span)
          || span < 1)
      {
          return 1;
      }

      return span;
  }

  /// <summary>
  /// Removes "&amp;nbsp;" entities, commas and line breaks from a text value and trims it, so the
  /// value fits into a single comma-separated field.
  /// </summary>
  private static string CleanHtmlToCsvText(string raw)
  {
      if (raw == null)
      {
          return string.Empty;
      }

      return raw.Replace("&nbsp;", "")
                .Replace(",", "")
                .Replace("\r", "")
                .Replace("\n", "")
                .Trim();
  }

 

html: 

 

csv:

 

url:http://www.cnblogs.com/dreamman/p/5343924.html

 

以上是关于C# Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素的主要内容,如果未能解决你的问题,请参考以下文章

如何将json的数据转化成csv的数据格式

C#获取CSV文件内容对逗号和引号分隔的处理

[转]C# 将类的内容写成JSON格式的字符串

如何使用 PHP 读取 CSV 文件并在 Table/DIV 中显示内容?

使用 Python 在 CSV 文件中编写完全相同的内容

C# datagrid tab分割的csv 编码格式 UTF-16格式