php html解析器

Posted

tags:

篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了 PHP HTML 解析器相关的知识，希望对你有一定的参考价值。

  /**
   * Event-driven parser for well-formed (X)HTML, built on PHP's XML extension.
   *
   * parse() feeds the markup through an expat-style parser and builds:
   *   - a nested array tree (returned by parse(), also kept in $output) where
   *     every element is array('name', 'attr', 'level'[, 'tagData']
   *     [, 'innerhtml'][, 'children']);
   *   - a re-serialised copy of the markup, available through get_html().
   *
   * Element and attribute names are reported upper-cased in the tree because
   * case folding is enabled; the re-serialised markup is lower-cased.
   *
   * Properties keep their original names and public visibility (they were
   * declared with `var`) so existing callers are unaffected; names starting
   * with an underscore are internal by convention.
   */
  class html_Parser {
      // XML parser handle created by the most recent parse() call.
      public $_parser;
      // Open element per nesting level: level => array(name, attr, level).
      public $_tags = array();
      // Re-serialised markup accumulated during the current parse.
      public $_html = '';
      // Stack of open elements while parsing; the finished tree afterwards.
      public $output = array();
      // Return value of xml_parse() for the last document (1 = ok, 0 = error).
      public $strXmlData;
      // Current nesting depth (the root element is level 1).
      public $_level = 0;
      // Copy of $_html in which tags of still-open levels are prefixed with
      // their level number; used to cut out each element's inner HTML.
      public $_outline = '';
      // Not used by this class; kept for backward compatibility.
      public $_tagcount = array();
      // Lower-case names of elements serialised as "<name ... />" with no
      // closing tag and no innerhtml entry.
      public $_void_tags = array('br', 'input');
      // Error details of the last parse() call.
      public $xml_error = false;
      public $xml_error_code;
      public $xml_error_string;
      public $xml_error_line_number;

      /**
       * Returns the markup re-serialised by the last parse() call.
       *
       * @return string
       */
      function get_html() {
          return $this->_html;
      }

      /**
       * Parses a complete document and returns its element tree.
       *
       * @param string $strInputXML Well-formed markup with a single root element.
       * @return array|false List holding the root element array, or false on a
       *                     parse error (details are then in the xml_error* fields).
       */
      function parse($strInputXML) {
          // Start every document from a clean slate; otherwise a second call
          // (in particular one following a failed parse) would inherit the old
          // nesting level, markup buffers and error flags.
          $this->output = array();
          $this->_tags = array();
          $this->_html = '';
          $this->_outline = '';
          $this->_level = 0;
          $this->xml_error = false;
          $this->xml_error_code = null;
          $this->xml_error_string = null;
          $this->xml_error_line_number = null;

          // The XML parser only knows the five XML entities; turn named HTML
          // entities into numeric references first.
          $strInputXML = $this->translate_entities($strInputXML);

          $this->_parser = xml_parser_create();
          xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);

          // Array callables work on every PHP version and avoid
          // xml_set_object() with string method names.
          xml_set_element_handler(
              $this->_parser,
              array($this, 'tagOpen'),
              array($this, 'tagClosed')
          );
          xml_set_character_data_handler($this->_parser, array($this, 'tagData'));

          // The whole document is passed in one call, so mark it as final.
          $this->strXmlData = xml_parse($this->_parser, $strInputXML, true);

          if (!$this->strXmlData) {
              $this->xml_error = true;
              $this->xml_error_code = xml_get_error_code($this->_parser);
              $this->xml_error_string = xml_error_string($this->xml_error_code);
              $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
              return false;
          }

          return $this->output;
      }

      /**
       * Start-element handler: pushes the element onto the stack and appends
       * its opening tag to the markup buffers.
       *
       * @param mixed  $parser Parser handle (unused).
       * @param string $name   Element name as reported by the parser.
       * @param array  $attr   Attribute name => value.
       */
      function tagOpen($parser, $name, $attr) {
          $this->_level++;

          $newtag = $this->create_tag($name, $attr);
          $tag = array("name" => $name, "attr" => $attr, "level" => $this->_level);

          // Push onto the stack of open elements and remember it per level so
          // tagClosed() can rebuild the identical opening tag.
          array_push($this->output, $tag);
          $this->_tags[$this->_level] = $tag;

          $this->_html .= $newtag;
          // In the outline the tag carries its level number as a marker.
          $this->_outline .= $this->_level . $newtag;
      }

      /**
       * Serialises an opening tag with lower-cased element/attribute names.
       *
       * @param string $name Element name.
       * @param array  $attr Attribute name => value.
       * @return string e.g. '<a href="x">' or '<br />' for void elements.
       */
      function create_tag($name, $attr) {
          $lower = strtolower($name);
          $tag = '<' . $lower;

          foreach ($attr as $key => $val) {
              $tag .= ' ' . strtolower($key) . '="' . htmlentities($val) . '"';
          }

          if ($this->is_void_tag($lower)) {
              $tag .= ' /';
          }

          return $tag . '>';
      }

      /**
       * Tells whether an element is serialised without a closing tag.
       *
       * @param string $name Element name in any case.
       * @return bool
       */
      function is_void_tag($name) {
          return in_array(strtolower($name), $this->_void_tags, true);
      }

      /**
       * Character-data handler: attaches text to the innermost open element
       * and appends it, entity-encoded, to the markup buffers.
       *
       * @param mixed  $parser  Parser handle (unused).
       * @param string $tagData Text chunk; one text node may arrive in pieces.
       */
      function tagData($parser, $tagData) {
          // Compare against '' explicitly: a plain truthiness test would drop
          // the text "0".
          if (trim($tagData) !== '' && !empty($this->output)) {
              $last = count($this->output) - 1;
              if (isset($this->output[$last]['tagData'])) {
                  $this->output[$last]['tagData'] .= $tagData;
              } else {
                  $this->output[$last]['tagData'] = $tagData;
              }
          }

          $encoded = htmlentities($tagData);
          $this->_html .= $encoded;
          $this->_outline .= $encoded;
      }

      /**
       * End-element handler: emits the closing tag, records the element's
       * inner HTML and moves the finished element under its parent.
       *
       * @param mixed  $parser Parser handle (unused).
       * @param string $name   Element name as reported by the parser.
       */
      function tagClosed($parser, $name) {
          $closeTag = '</' . strtolower($name) . '>';
          $markedClose = $this->_level . $closeTag;

          // Void elements get no closing tag (and therefore no innerhtml).
          if (!$this->is_void_tag($name)) {
              $this->_outline .= $markedClose;
              $this->_html .= $closeTag;
          }

          // Rebuild the opening tag that belongs to this end tag.
          $open = $this->_tags[$this->_level];
          $openTag = $this->create_tag($open['name'], $open['attr']);
          $markedOpen = $this->_level . $openTag;

          // Inner HTML is whatever sits between the level-marked opening and
          // closing tags; markers of already-closed children are gone by now.
          $regex = '%' . preg_quote($markedOpen, '%') . '(.*?)' . preg_quote($markedClose, '%') . '%is';
          $hasInner = preg_match($regex, $this->_outline, $matches) && isset($matches[1]);

          // Strip this level's markers so enclosing elements see clean markup.
          $this->_outline = str_replace($markedOpen, $openTag, $this->_outline);
          $this->_outline = str_replace($markedClose, $closeTag, $this->_outline);

          $last = count($this->output) - 1;
          if ($hasInner) {
              $this->output[$last]['innerhtml'] = $matches[1];
          }

          // Move the finished element under its parent. The root element has
          // no parent and stays in $output as the result.
          if ($last >= 1) {
              $this->output[$last - 1]['children'][] = $this->output[$last];
              array_pop($this->output);
          }

          $this->_level--;
      }

      /**
       * Converts named HTML entities (&eacute;, &nbsp;, ...) to numeric
       * character references, or back when $reverse is true. The XML
       * specials & " < > are left untouched.
       *
       * @param string $xmlSource Markup to convert.
       * @param bool   $reverse   true to map numeric references back to names.
       * @return string
       */
      function translate_entities($xmlSource, $reverse = FALSE) {
          static $literal2NumericEntity;

          if (empty($literal2NumericEntity)) {
              $literal2NumericEntity = array();
              // Request the table in UTF-8 explicitly so the characters can be
              // decoded reliably regardless of the default_charset setting.
              $transTbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8');

              foreach ($transTbl as $char => $entity) {
                  if (strpos('&"<>', $char) !== FALSE) continue;
                  $literal2NumericEntity[$entity] = '&#' . $this->utf8_codepoint($char) . ';';
              }
          }

          if ($reverse) {
              return strtr($xmlSource, array_flip($literal2NumericEntity));
          }

          return strtr($xmlSource, $literal2NumericEntity);
      }

      /**
       * Returns the Unicode code point of the first UTF-8 character in $char.
       * ord() alone only sees the lead byte of a multi-byte character.
       *
       * @param string $char Non-empty UTF-8 string.
       * @return int
       */
      function utf8_codepoint($char) {
          $lead = ord($char[0]);

          if ($lead < 0x80) {
              return $lead;
          }

          if (($lead & 0xE0) === 0xC0) {
              $code = $lead & 0x1F;
              $extra = 1;
          } elseif (($lead & 0xF0) === 0xE0) {
              $code = $lead & 0x0F;
              $extra = 2;
          } else {
              $code = $lead & 0x07;
              $extra = 3;
          }

          // Fold in the continuation bytes, six payload bits each.
          for ($i = 1; $i <= $extra && isset($char[$i]); $i++) {
              $code = ($code << 6) | (ord($char[$i]) & 0x3F);
          }

          return $code;
      }
  }

以上是关于 PHP HTML 解析器的主要内容，如果未能解决你的问题，请参考以下文章：

Php Html 解析器“查找下一个”

本地服务器上的 PHP 简单 HTML DOM 解析器

PHP-XML基于流的解析器及其他常用解析器

php html解析器

缓存 PHP 简单 HTML DOM 解析器

HTML链接PHP解析器的URL