php实现的一个很好用HTML解析器类可用于采集数据
网络编程 2021-07-05 09:49www.168986.cn编程入门
下面就是这个 HTML 解析类及其用法。示例代码的功能是采集 .opendir. 这个网站的百度收录数据,需要的朋友可以测试一下。
代码如下:
<?php
// Buffer libxml parse errors internally instead of emitting them as PHP
// warnings; the previous setting is returned and kept in $oldSetting.
// NOTE(review): $oldSetting is never restored in the code visible here — confirm
// whether that is intentional.
$oldSetting = libxml_use_internal_errors( true );
// Discard any libxml errors that are already buffered.
libxml_clear_errors();
/**
 * -+-----------------------------------
 * |PHP5 Framework - 2011
 * |Web Site: .iblue.
 * |E-mail: mejinke@gmail.
 * |Date: 2012-10-12
 * -+-----------------------------------
 * @desc HTML parser. Wraps a DOMXPath over a loaded HTML document; every
 *       instance points at one node through its absolute XPath ("node path").
 *       An instance with an empty node path represents the document root.
 * @author jingke
 */
class XF_HtmlDom
{
    /** @var DOMXPath|null XPath evaluator shared by all wrappers of one document. */
    private $_xpath = null;

    /** @var string Absolute XPath of the wrapped node; '' means "whole document". */
    private $_nodePath = '';

    /**
     * @param DOMXPath|null $xpath    XPath evaluator of an already loaded document.
     * @param string        $nodePath Absolute XPath of the node this wrapper points at.
     */
    public function __construct($xpath = null, $nodePath = '')
    {
        $this->_xpath = $xpath;
        $this->_nodePath = $nodePath;
    }

    /**
     * Loads an HTML document and makes it the document searched by find().
     *
     * Sources starting with "http://" or "https://" are fetched with cURL
     * (following redirects); anything else is read with file_get_contents().
     *
     * @param string $url Remote URL or local file path.
     * @return void
     * @throws RuntimeException When the source cannot be read or is empty.
     */
    public function loadHtml($url)
    {
        // Kept from the original implementation: sets the user agent used by
        // PHP's own HTTP stream wrapper for the rest of the request.
        ini_set('user_agent', 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17 –Nexus');

        if ($this->_isHttpUrl($url)) {
            $content = $this->_fetchRemote($url);
        } else {
            $content = file_get_contents($url);
        }

        if ($content === false || $content === '') {
            throw new RuntimeException('Unable to load HTML from: ' . $url);
        }

        // Real-world HTML is rarely well-formed; buffer libxml's complaints
        // instead of emitting warnings, then restore the caller's setting.
        $previous = libxml_use_internal_errors(true);
        $html = new DOMDocument();
        $html->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $this->_xpath = new DOMXPath($html);
    }

    /**
     * Runs an XPath query relative to this node.
     *
     * On the document root the query is prefixed with "//" (search anywhere);
     * on a concrete node it is prefixed with "<node path>/" (direct children,
     * unless the query itself starts with "/").
     *
     * @param string   $query XPath fragment, e.g. "td[@id='baidu']/a".
     * @param int|null $index When omitted, all matches are returned; otherwise
     *                        only the match at this zero-based position.
     * @return XF_HtmlDom[]|XF_HtmlDom List of wrappers, or a single wrapper.
     *         A single lookup that matches nothing returns an empty wrapper
     *         whose text()/getAttribute() return false.
     */
    public function find($query, $index = null)
    {
        // Loose comparison on purpose: null/''/false mean "all matches",
        // while 0 and '0' select the first match.
        $wantsList = ($index == null && !is_numeric($index));

        $nodes = false;
        if ($this->_xpath !== null) {
            // Build the prefix locally; storing it in $_nodePath would corrupt
            // this instance for every later find()/text()/getAttribute() call.
            $prefix = ($this->_nodePath == '') ? '//' : $this->_nodePath . '/';
            $nodes = $this->_xpath->query($prefix . $query);
        }

        if ($wantsList) {
            $found = array();
            if ($nodes !== false) {
                foreach ($nodes as $node) {
                    $found[] = new self($this->_xpath, $node->getNodePath());
                }
            }
            return $found;
        }

        $node = ($nodes === false) ? null : $nodes->item((int) $index);
        if ($node === null) {
            return new self();
        }

        return new self($this->_xpath, $node->getNodePath());
    }

    /**
     * Gets the text content of the wrapped node.
     *
     * @return string|false The node's text, or false when this wrapper does
     *                      not point at an existing node.
     */
    public function text()
    {
        $node = $this->_currentNode();

        return ($node === null) ? false : $node->textContent;
    }

    /**
     * Gets an attribute value of the wrapped element.
     *
     * @param string $name Attribute name.
     * @return string|false The attribute value, or false when this wrapper
     *                      does not point at an existing element.
     */
    public function getAttribute($name)
    {
        $node = $this->_currentNode();

        // Only elements carry attributes; text or attribute nodes do not.
        return ($node instanceof DOMElement) ? $node->getAttribute($name) : false;
    }

    /**
     * Property-style access: "innertext" maps to text(), any other name is
     * read as an attribute of the wrapped element.
     *
     * @param string $name Property name.
     * @return string|false
     */
    public function __get($name)
    {
        if ($name == 'innertext') {
            return $this->text();
        }

        return $this->getAttribute($name);
    }

    /** Tells whether $url starts with an http:// or https:// scheme. */
    private function _isHttpUrl($url)
    {
        return stripos($url, 'http://') === 0 || stripos($url, 'https://') === 0;
    }

    /** Downloads $url with cURL; returns the body, or false on failure. */
    private function _fetchRemote($url)
    {
        $ch = curl_init();
        curl_setopt_array($ch, array(
            CURLOPT_URL            => $url,
            CURLOPT_HEADER         => false,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_REFERER        => $url,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0',
            CURLOPT_FOLLOWLOCATION => 1,
        ));
        $content = curl_exec($ch);
        curl_close($ch);

        return $content;
    }

    /** Resolves the wrapped node, or null when there is none. */
    private function _currentNode()
    {
        if ($this->_nodePath == '' || $this->_xpath == null) {
            return null;
        }

        $nodes = $this->_xpath->query($this->_nodePath);

        return ($nodes === false) ? null : $nodes->item(0);
    }
}
// Demo: load a remote page and print the text of the first link found inside
// the table cell whose id is "baidu".
// NOTE(review): the host names in the URL below look truncated — confirm the
// intended address before running.
$xp = new xf_HtmlDom();
$xp->loadHtml('http://.aizhan./siteall/.opendir./');

// Index 0 asks find() for a single wrapper rather than a list of matches.
$firstBaiduLink = $xp->find("td[@id='baidu']/a", 0);

// "innertext" is served by the class's magic property accessor.
$rows = $firstBaiduLink->innertext;
print_r($rows);
编程语言
- 宿迁百度关键词排名指南:实现精准营销的关键
- 四川SEO优化怎么做网络推广
- 立昂技术备案老域名收购:如何为您的业务赋能
- 安徽百度关键词seo贵不贵,一般需要多少钱
- 吉林百度快照排名怎么做电话营销
- 多伦新手做SEO怎么做
- 甘肃优化关键词排名推广怎么做论坛营销
- 沙雅SEO网站推广:提升您的在线可见性
- 四川SEO优化如何提升销售额和销售量
- 聂荣网站排名优化:提升网站可见性的全方位指
- 涞水SEO:提升地方企业在线可见性的策略
- 辽宁百度seo排名怎样做网站排名
- 临湘哪有关键词排名优化:提升网站可见度的关
- 黑龙江百度网站优化有没有优惠
- 凉城优化关键词排名推广:提升您的网络可见性
- 萝北整站优化:提升您网站流量和排名的全面指