将 html 放入一个庞大的数组中
Posted
技术标签:
【中文标题】将 html 放入一个庞大的数组中【英文标题】:Get html into a massive array 【发布时间】:2013-06-16 18:26:45 【问题描述】:我有一个小项目,我必须保存整个网站的 DOM,然后进行一些操作,例如获取 css 计算样式等。(这将使用 jquery 完成)
目前我已经编写了几个函数来通过 cURL 获取 html 标记。有什么办法可以将此 DOM 保存到多维数组中?递归?
目前我正在使用这个库:https://code.google.com/p/php-html2array/(我使用的是 1.01 版)。它工作得很好,只是由于某种原因会漏掉一些元素,比如最主要的“body”标签,以及其他一些关键元素。我正在测试的 URL 是:http://www.vulytrampolines.com/
有人可以告诉我这该如何实现,或者我该如何修改 Google Code 上的这份 PHP 代码,使它能够识别 body 标签吗?
到目前为止我的代码 (Pastebin):
<?php
/**
* Website Layout Checker
*
* @package
* @author Marais Rossouw (marais.r@vulytrampolines.com)
* @copyright Vuly
* @version 2013
* @access public
*/
require_once '../setup.php';
ini_set('max_execution_time', 6000);
/**
 * Fetches a web page, parses its markup into a nested array and stores both
 * the raw HTML and a JSON dump of that array on disk.
 *
 * Progress and error messages are collected in an in-memory "console" that
 * can be rendered as text with renderConsole().
 */
class layout
{
    private $_LAYOUT, $_URL, $_DOC, $_LAYOUT_ARRAY, $_SAVE_TO_JSON, $_SAVE_TO_HTML, $_HTML_BODY;
    // Declared explicitly; they were previously created on the fly in _INIT().
    private $_FILE_NAME, $_FILE_PATH;
    private $_CONSOLE = array();

    /**
     * Downloads $url, builds the DOM array and writes the output files.
     *
     * Failures are recorded in the console instead of being thrown; after a
     * failed step the remaining steps are skipped because they depend on it.
     *
     * @param string $url Address of the page to fetch.
     */
    public function __construct($url)
    {
        // NOTE(review): $url is fetched as given. If it comes from user input
        // this lets a visitor make the server request arbitrary hosts — confirm
        // whether the tool is only reachable by trusted users.
        // Get's the contents of the page specified.
        try {
            $client = new Zend_Http_Client;
            $client->setUri($url);
            $client->setConfig(array('strictredirects' => true, 'maxredirects' => 10, 'timeout' => 8));
            $response = $client->request();
            $this->_LAYOUT = $response->getBody();
            $this->_URL = $url;
        } catch (Exception $e) {
            $this->consoleLog($e);
            return; // nothing was downloaded, so there is nothing to parse or save
        }

        // Creates a DOMDocument
        try {
            $this->_INIT();
        } catch (Exception $e) {
            $this->consoleLog($e);
            return; // the output paths are only known once _INIT() has completed
        }

        // Save the files. file_put_contents() reports failure through its
        // return value rather than an exception, so check it explicitly.
        $json = json_encode($this->_LAYOUT_ARRAY);
        if ($json !== false && file_put_contents($this->_SAVE_TO_JSON, $json) !== false) {
            $this->consoleLog("The JSON file was saved to: " . $this->_SAVE_TO_JSON);
        } else {
            $this->consoleLog("The JSON file could not be saved to: " . $this->_SAVE_TO_JSON);
        }
        if (file_put_contents($this->_SAVE_TO_HTML, $this->_LAYOUT) !== false) {
            $this->consoleLog("The HTML file was saved to: " . $this->_SAVE_TO_HTML);
        } else {
            $this->consoleLog("The HTML file could not be saved to: " . $this->_SAVE_TO_HTML);
        }
    }

    /**
     * Parses the downloaded markup and decides where the results are saved.
     *
     * Builds a DOMXPath over the page, converts the markup into the nested
     * array produced by htmlParser::toArray(), and derives the output file
     * names from the URL, the current date and a random number.
     */
    private function _INIT()
    {
        $doc = new DOMDocument();
        // Real-world HTML is rarely valid; keep libxml's complaints out of the
        // output, then drop them and restore the caller's setting.
        $previous = libxml_use_internal_errors(true);
        $doc->loadHTML($this->_LAYOUT);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        $this->_DOC = new DOMXpath($doc);
        $this->consoleLog("DOMDocument created");

        $parser = new htmlParser($this->_LAYOUT);
        $this->_LAYOUT_ARRAY = $parser->toArray();
        $this->consoleLog("Dom array created");
        $this->consoleLog("There are " . count($this->_LAYOUT_ARRAY, COUNT_RECURSIVE) . " elements in the dom array");

        $this->_FILE_NAME = "VULY_LAYOUT_CHECKER-" . sha1(htmlspecialchars(trim($this->_URL)) . date("Ymd") . rand(99, 9999));
        // Relative, backslash-terminated directory (sys_get_temp_dir() was
        // considered as an alternative by the original author).
        $this->_FILE_PATH = "layout_checker\\";
        $this->_SAVE_TO_JSON = $this->_FILE_PATH . $this->_FILE_NAME . ".txt";
        $this->_SAVE_TO_HTML = $this->_FILE_PATH . $this->_FILE_NAME . ".html";
    }

    /**
     * @return string|null The raw HTML that was downloaded, or null if the fetch failed.
     */
    public function toString()
    {
        return $this->_LAYOUT;
    }

    /**
     * @return string|null The innerHTML recorded for the first "body" node found
     *                     in the DOM array, or null if there is none.
     */
    public function getBody()
    {
        $this->recurse($this->_LAYOUT_ARRAY);
        return $this->_HTML_BODY;
    }

    /**
     * Depth-first search for the "body" node; stores its innerHTML in $_HTML_BODY.
     *
     * @param array $file List of nodes as produced by htmlParser::toArray().
     */
    private function recurse($file)
    {
        if ($this->_HTML_BODY != false || !is_array($file)) {
            return;
        }
        foreach ($file as $node) {
            // Test the tag before looking at the children, so that a body
            // element without child elements is still recognised.
            if (isset($node['tag']) && strcasecmp($node['tag'], "body") === 0) {
                $this->_HTML_BODY = $node['innerHTML'];
                return;
            }
            if (!empty($node['childNodes'])) {
                $this->recurse($node['childNodes']);
                if ($this->_HTML_BODY != false) {
                    return;
                }
            }
        }
    }

    /**
     * @return string Contents of the saved JSON file, or an empty string when
     *                no file was written or it cannot be read.
     */
    public function getJSON_FILE()
    {
        if (!$this->_SAVE_TO_JSON || !is_readable($this->_SAVE_TO_JSON)) {
            return '';
        }
        return file_get_contents($this->_SAVE_TO_JSON);
    }

    /**
     * @return string|null Relative path of the saved HTML copy (null if never set).
     */
    public function get_SAVE_TO_HTML()
    {
        return $this->_SAVE_TO_HTML;
    }

    /**
     * Appends one entry to the console log.
     *
     * @param mixed $string Message; anything that can be converted to a string
     *                      (exceptions are logged via their string form).
     */
    public function consoleLog($string)
    {
        $this->_CONSOLE[] = $string;
    }

    /**
     * Renders the console as text, one "index: message" line per entry.
     *
     * @return string
     */
    public function renderConsole()
    {
        $return = "";
        // Width of the largest entry count plus room for ":" and one space.
        $_PAD_SIZE = strlen((string) count($this->_CONSOLE)) + 2;
        foreach ($this->_CONSOLE as $key => $value) {
            $return .= str_pad($key . ":", $_PAD_SIZE) . $value . "\n";
        }
        return $return;
    }
}
/**
 * Regex-based HTML-to-array converter.
 *
 * The markup is reduced from the inside out: every matched element is stored
 * in $levelArray and replaced in the working string by a placeholder of the
 * form <separator><level>-<index><separator> (e.g. "~0-3~"). Once nothing is
 * left to match, the placeholders are expanded back into a nested array.
 *
 * Limitation visible in the code: text that itself contains the separator
 * character is indistinguishable from a placeholder.
 */
class htmlParser
{
    //your very own separator
    //do not enter characters such as < or >
    private $separator = '~';
    //the tags that don't have any innerHTML in them
    //feel free to add some if I missed any
    private $singleTags = 'meta|img|hr|br|link|!--|!DOCTYPE|input';
    //-- Don't edit below this --
    private $html, $level;
    public $levelArray;

    /**
     * @param string $html Markup to parse (may also be supplied later via toArray()).
     */
    public function __construct($html = '')
    {
        $this->html = $this->removeWhiteSpace($html);
        $this->level = -1;
        $this->levelArray = array();
    }

    public function __destruct()
    {
        //nothing yet;
    }

    /**
     * Looks up the stored element a placeholder refers to.
     *
     * @param string $value Placeholder such as "~1-0~".
     * @return array|null The element record, or null when $value is not a known placeholder.
     */
    private function getElement($value)
    {
        $parts = explode($this->separator, $value);
        if (!isset($parts[1])) {
            return null;
        }
        $key = explode('-', $parts[1]);
        if (!isset($key[1]) || !isset($this->levelArray[$key[0]][$key[1]])) {
            return null;
        }
        return $this->levelArray[$key[0]][$key[1]];
    }

    /**
     * Expands every placeholder in $str back into markup.
     *
     * @param string $str   Text containing placeholders.
     * @param int    $level Level below the element being built; kept for
     *                      signature compatibility but no longer consulted.
     * @return string
     */
    private function parseToHTML($str, $level)
    {
        foreach ($this->getArrayOfReplacements($str) as $item) {
            $elem = $this->getElement($item);
            if ($elem === null) {
                continue; // not a real placeholder; leave the text untouched
            }
            // Level-0 records already hold the complete element, tags included,
            // while deeper records hold only the inner markup. Decide by the
            // child's own level: deciding by the parent's level wrapped level-0
            // children a second time whenever the parent was at level 2 or more.
            if ($elem['level'] == 0) {
                $markup = $elem['htmlText'];
            } else {
                $markup = '<' . $elem['tag'] . $elem['attr'] . '>' . $elem['htmlText'] . '</' . $elem['tag'] . '>';
            }
            $str = str_replace($item, $markup, $str);
        }
        return $str;
    }

    /**
     * Removes elements that never have content (img, input, comments, ...)
     * from the working string.
     */
    private function replaceSingleTags()
    {
        //tags like img, input etc
        $result = preg_match_all('/<(' . $this->singleTags . ')(.[^><]*)?>/is', $this->html, $m);
        if ($result > 0) {
            foreach ($m[0] as $value) {
                $this->html = str_replace($value, '', $this->html);
            }
        }
    }

    /**
     * Records every match of $pattern as a new level and swaps each match for
     * its placeholder in the working string.
     *
     * @param string $pattern             Regex with groups 1 = tag, 2 = attributes, 3 = content.
     * @param bool   $wholeMatchAtLevel0  At level 0, store the whole match (true)
     *                                    or only the content group (false) as htmlText.
     */
    private function replaceMatchedTags($pattern, $wholeMatchAtLevel0)
    {
        $result = preg_match_all($pattern, $this->html, $m);
        if (!($result > 0)) {
            return;
        }
        $this->level++;
        $oneLevel = array();
        foreach ($m[0] as $id => $value) {
            $rep = $this->separator . $this->level . '-' . $id . $this->separator;
            if ($this->level == 0) {
                $htmlText = $wholeMatchAtLevel0 ? $value : $m[3][$id];
            } else {
                $htmlText = $this->parseToHTML($m[3][$id], $this->level - 1);
            }
            $oneLevel[] = array(
                'str' => $value,
                'rep' => $rep,
                'tag' => $m[1][$id],
                'level' => $this->level,
                'text' => $m[3][$id],
                'attr' => $m[2][$id],
                'htmlText' => $htmlText,
            );
            $this->html = str_replace($value, $rep, $this->html);
        }
        $this->levelArray[$this->level] = $oneLevel;
    }

    /**
     * Collapses elements whose content holds no further tags into placeholders.
     */
    private function replaceSimpleTags()
    {
        //tags that only have text in them (no other content)
        $this->replaceMatchedTags('/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is', true);
    }

    /**
     * Collapses whatever paired tags are still present after the main loop.
     */
    private function replaceRemainingTags()
    {
        //tags that remain after everything
        $this->replaceMatchedTags('/<(.[^\s]*)(.[^><]*)?>(.*)?<\/\1>/is', false);
    }

    /** @return bool Whether the working string still contains a text-only element. */
    private function existSimpleTags()
    {
        $result = preg_match('/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is', $this->html);
        return $result > 0;
    }

    /** @return bool Whether the working string still contains a content-less tag. */
    private function existSingleTags()
    {
        $result = preg_match('/<(' . $this->singleTags . ')(.[^><]*)?>/is', $this->html);
        return $result > 0;
    }

    /**
     * Strips line breaks, tabs and the third listed character, then collapses
     * runs of spaces into one.
     *
     * NOTE(review): the third entry reads as a plain space in this copy. If so,
     * all spaces are removed (including those inside text and between
     * attributes) and the collapse below can never match. It may originally
     * have been a non-breaking space or an "&nbsp;" entity — confirm against
     * the upstream library before changing it.
     */
    private function removeWhiteSpace($string)
    {
        $string = str_replace(array("\n", "\r", ' ', "\t"), '', $string);
        return preg_replace('| +|', ' ', $string);
    }

    /**
     * Converts markup into a nested array of nodes.
     *
     * Each node has the keys tag, innerHTML, repl, stratr, level, father and
     * childNodes (a list of nodes of the same shape).
     *
     * @param string $html Optional markup replacing what the constructor received.
     * @return array
     */
    public function toArray($html = '')
    {
        //first part: coding
        if ($html != '') {
            $this->html = $this->removeWhiteSpace($html);
        }
        while ($this->existSimpleTags() || $this->existSingleTags()) {
            $this->replaceSingleTags();
            $this->replaceSimpleTags();
        }
        $this->replaceRemainingTags();
        //now decoding
        return $this->getArray($this->html);
    }

    /**
     * Extracts the placeholders contained in $str.
     *
     * Splitting on the separator leaves placeholder keys at the odd positions.
     *
     * @return array List of placeholders, separators included.
     */
    private function getArrayOfReplacements($str)
    {
        $final = array();
        $ar = explode($this->separator, $str);
        for ($i = 0; $i < (count($ar) - 1) / 2; $i++) {
            $final[] = $this->separator . $ar[$i * 2 + 1] . $this->separator;
        }
        return $final;
    }

    /** @return bool False when the first non-blank character is "<" or ">". */
    private function startsWithText($str)
    {
        $first = substr(trim(str_replace(array("\n", "\r"), '', $str)), 0, 1);
        if ($first == '<' || $first == '>') {
            return false;
        }
        return true;
    }

    /** @return bool Whether $str contains at least one of the strings in $array. */
    private function strInArray($array, $str)
    {
        foreach ($array as $item) {
            if (strpos($str, $item) !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the node list for the placeholders found in $html, recursing into
     * each element's own content.
     *
     * @param string $html   Text containing placeholders.
     * @param string $father Placeholder of the parent element ('' at the top).
     * @return array
     */
    private function getArray($html, $father = '')
    {
        $final = array();
        if (strpos($html, $this->separator) !== false) {
            foreach ($this->getArrayOfReplacements($html) as $i) {
                $elem = $this->getElement($i);
                if ($elem === null) {
                    continue; // stray separator in text, not a stored element
                }
                $ar = explode('-', trim($i, $this->separator));
                // Remember the parent so loadNode() can report it later.
                $this->levelArray[$ar[0]][$ar[1]]['father'] = $father;
                $final[] = array(
                    'tag' => $elem['tag'],
                    'innerHTML' => $elem['htmlText'],
                    'repl' => $elem['rep'],
                    'stratr' => $elem['attr'],
                    'level' => $elem['level'],
                    'father' => $father,
                    'childNodes' => $this->getArray($elem['text'], $i),
                );
            }
        }
        return $final;
    }

    /**
     * Returns a single node (without its children) by placeholder.
     *
     * @param string $rep Placeholder such as "~1-0~".
     * @return array|null Node record, or null for an unknown placeholder. The
     *                    father entry is '' until toArray() has visited the node.
     */
    public function loadNode($rep)
    {
        $elem = $this->getElement($rep);
        if ($elem === null) {
            return null;
        }
        return array(
            'tag' => $elem['tag'],
            'innerHTML' => $elem['htmlText'],
            'repl' => $elem['rep'],
            'stratr' => $elem['attr'],
            'level' => $elem['level'],
            'father' => isset($elem['father']) ? $elem['father'] : '',
        );
    }
}
// Run the checker only when a non-empty URL string was submitted; otherwise
// give the template empty values so the page renders as a blank form.
if (isset($_REQUEST['layout']) && is_string($_REQUEST['layout']) && trim($_REQUEST['layout']) !== '') {
    $layout = new layout($_REQUEST['layout']);
    $console = $layout->renderConsole();
    $json_file = $layout->getJSON_FILE();
    $toString = $layout->toString();
    // Address of the saved HTML copy, loaded into the preview iframe.
    $getBody = "http://" . $_SERVER['SERVER_NAME'] . "/etramp/scripts/" . $layout->get_SAVE_TO_HTML();
} else {
    $console = "";
    $json_file = "";
    $toString = "";
    $getBody = "";
}
?>
<html>
<head>
<title>Vuly Layout Checker</title>
<style type="text/css">
/* Page chrome for the layout checker: gradient background, centred form,
   and fixed-position panes (.text1 top row, .text2 bottom row). */
html {
    height: 100%;
    margin: 0;
    padding: 0;
}
body {
    background: #728eaa;
    background: -moz-linear-gradient(top, #25303C 0%, #728EAA 100%);
    background: -webkit-gradient(linear, left top, left bottom, color-stop(0%, #25303C), color-stop(100%, #728EAA));
    font-family: sans-serif;
}
input, select {
    padding: 10px;
}
select, input[type='submit'] {
    cursor: pointer;
}
label {
    color: #fff;
    padding-right: 10px;
}
form {
    margin: 50px auto 0 auto;
    width: 684px;
}
.text1 {
    width: 49%; height: 220px; resize: none; position: fixed; top: 150px;
}
.text2 {
    width: 49%; resize: none; position: fixed; top: 380px; bottom: 10px; height: 58%;
}
</style>
<script src="//ajax.googleapis.com/ajax/libs/jquery/1.10.1/jquery.min.js"></script>
<script type="text/javascript">
var file;
$( document ).ready(function()
file = <?php echo $json_file; ?>;
recurse(file);
);
function recurse(file)
console.log(file[i].tag);
for (var i = 0; i < file.length; i++)
if (file[i].childNodes)
if (file[i].tag == "body")
console.log($(file[i].innerHeml, $('#NEW_LAYOUT').contents()));
alert(file[i].tag);
else
recurse(file[i].childNodes);
</script>
</head>
<body>
<?php
// Everything echoed below is either request input or markup fetched from a
// remote site, so it is HTML-escaped before being placed in the page. The
// casts keep htmlspecialchars() from receiving null/false.
?>
<textarea class="text1" style="left:10px;"><?php echo htmlspecialchars((string) $console, ENT_QUOTES, 'UTF-8'); ?></textarea>
<textarea class="text1" style="right:10px;"><?php echo htmlspecialchars((string) $json_file, ENT_QUOTES, 'UTF-8'); ?></textarea>
<form>
<label for="layout">Website URL:</label>
<input type="text" name="layout" id="layout" style="width: 500px" value="<?php echo htmlspecialchars((isset($_REQUEST['layout']) && is_string($_REQUEST['layout'])) ? $_REQUEST['layout'] : "http://", ENT_QUOTES, 'UTF-8'); ?>">
<input type="submit">
</form>
<textarea class="text2" style="left:10px;"><?php echo htmlspecialchars((string) $toString, ENT_QUOTES, 'UTF-8'); ?></textarea>
<iframe id="NEW_LAYOUT" class="text2" style="right:10px;" src="<?php echo htmlspecialchars((string) $getBody, ENT_QUOTES, 'UTF-8'); ?>"></iframe>
</body>
</html>
【问题讨论】:
因为我必须将其保存为 JSON,然后由 jQuery 读取,以获取每个元素的计算样式。最终我想制作一个应用程序,将网站当前的输出与之前保存的结果进行比较。 请联系该库的作者以了解可用的支持方式。我敢说,既然出了问题,就可能存在缺陷——无论是在文档中还是在代码中——因此应该让原作者知情并参与进来。还要注意,*** 面向的是具体的编程问题;而你只是在让我们替你解决问题:既没有真正分析代码、指出出问题的代码行及其原因,也没有说明该库在某个具体页面上无法工作时问题究竟是什么。 【参考方案1】:我会推荐类似 phpQuery 的库:https://code.google.com/p/phpquery/
所以你要做的就是用 cURL 请求该 URL,然后将结果传递给 phpQuery,如下所示:
phpQuery::selectDocument($doc);
作为一个例子,下面演示如何遍历 HTML(即最后选中的 DOM 中的所有 li 元素):
foreach(pq('li') as $li)
// iteration returns PLAIN dom nodes, NOT phpQuery objects
$tagName = $li->tagName;
$childNodes = $li->childNodes;
// so you NEED to wrap it within phpQuery, using pq();
pq($li)->addClass('my-second-new-class');
【讨论】:
我不只是想要 li,我基本上希望把 DOM 中的所有内容都放进一个数组里。这是我目前的代码:pastebin.com/CKzSH04a(顶部与 Zend 相关的内容大概可以删掉。)以上是关于将 html 放入一个庞大的数组中的主要内容,如果未能解决你的问题,请参考以下文章
源代码将一个整数的每位数分解并按逆序放入一个数组中(用递归算法)(C语言实现)
如何将 SAFEARRAY(字节数组)放入 HTML 隐藏字段