使用PHP和XPath抓取网页并提取字符串
我需要抓取这个HTML页面……(使用PHP和XPath抓取并提取字符串)
https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115
……使用PHP和XPath,获取名为“PO G.TATARELLA-CERIGNOLA”的表格下方绿色框中的值。
(注:如果你打开该页面,可能会看到不同的值……没关系,这个值是动态变化的。)
我用下面这段PHP示例代码来打印这个值……
<?php ini_set('display_errors', 'On');
// Report every error level while debugging this script.
error_reporting(E_ALL);
// Page to scrape.
$url = 'https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115';
// Absolute XPath to the wanted cell.
// NOTE(review): the "tbody" step may not exist in the raw HTML (browsers add it when
// building their DOM) - confirm against the page source.
$xpath_for_parsing = '/html/body/div[4]/table/tbody/tr[2]/td[4]/div';
// Configure the cURL handle; pay attention to the PROXY setting.
$ch = curl_init();
curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
// Keep the response headers out of the returned data.
curl_setopt($ch, CURLOPT_HEADER, 0);
// Make curl_exec() return the body instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
// The proxy is set to an empty string here.
curl_setopt($ch, CURLOPT_PROXY, '');
// NOTE(review): curl_exec() returns false on failure and that result is not checked;
// no TLS/CA options are set for this https URL - verify whether the request succeeds.
$data = curl_exec($ch);
curl_close($ch);
$dom = new DOMDocument();
// The @ operator hides the warnings loadHTML() raises for malformed markup.
@$dom->loadHTML($data);
$xpath = new DOMXPath($dom);
$colorWaitingNumber = $xpath->query($xpath_for_parsing);
// Fallback that is printed when the query matches no node.
$theValue = 'N.D.';
// Every match overwrites the previous one, so the last matching node wins.
foreach($colorWaitingNumber as $node)
{
$theValue = $node->nodeValue;
}
print $theValue;
?>
这样,我得到的输出是“N.D.”,而不是我想要的值。
页面源代码如下...
在我的代码中,我不想使用“绝对XPath”,所以尝试了类似下面的语法(我知道这样行不通,但我是XPath新手……)
// This predicate selects elements that have a child <div> whose text content equals
// the literal string; it does not test the div's class attribute.
$xpath_for_parsing = '//*[div="cRiga3 boxtriageS"]';
但结果总是一样的。
任何建议/示例?
回答:
我认为下面的代码应该会有所帮助——您需要调整XPath查询,以便定位特定的表格和特定的单元格内容,但主体代码似乎可以正常工作。我怀疑原始代码的问题在于URL是https的,
而发起cURL请求时通常需要针对https做额外的配置。_curlrequest
函数中有一些设置是可以删除的,我只是从另一个脚本中把这些设置复制了过来。
请将 $cacert 的路径
改为您系统上 cacert.pem 副本的路径,
或者改为 curl.haxx.se 上的在线版本(live version)。
$url = 'https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115';

/**
 * Perform a request with cURL and collect the outcome in a single object.
 *
 * @param string|null $url     Address to fetch. When null no request is made and the
 *                             default result (http_code 100) is returned.
 * @param array|null  $options Extra CURLOPT_* => value pairs. They are applied after
 *                             the defaults below, so they can override any of them.
 * @return object Properties:
 *                response - body returned by curl_exec() (false on failure, null if no request was made),
 *                info     - object built from curl_getinfo() (always has http_code),
 *                errors   - text from curl_error(),
 *                verbose  - cURL's verbose log captured from its STDERR stream,
 *                headers  - always null (kept so both return paths have the same shape).
 */
function _curlrequest($url=null, $options=null){
    /* Adjust this path to a local copy of cacert.pem */
    $cacert='c:/wwwroot/cacert.pem';

    /* Default result. "info" is an object so that ->info->http_code works on every return path */
    $res=(object)array(
        'response'  =>  null,
        'verbose'   =>  null,
        'info'      =>  (object)array('http_code' => 100),
        'headers'   =>  null,
        'errors'    =>  null
    );
    if(is_null($url)) return $res;

    session_write_close();

    /* Opened only once a request is certain, so the early return above cannot leak the handle */
    $vbh = fopen('php://temp', 'w+');

    /* Initialise curl request object */
    $curl=curl_init();
    if(parse_url($url,PHP_URL_SCHEME)=='https'){
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
        /* Use the bundle only when it exists; otherwise cURL falls back to its default CA store.
           Verification stays enabled either way. */
        if(is_file($cacert)) curl_setopt($curl, CURLOPT_CAINFO, $cacert);
    }

    /* Define standard options (CURLOPT_BINARYTRANSFER was dropped: it has no effect and is deprecated) */
    curl_setopt($curl, CURLOPT_URL,trim($url));
    curl_setopt($curl, CURLOPT_AUTOREFERER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_FAILONERROR, true);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLINFO_HEADER_OUT, false);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 20);
    curl_setopt($curl, CURLOPT_TIMEOUT, 60);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36');
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_ENCODING, '');

    /* Send cURL's verbose log to the temporary stream so it can be returned to the caller */
    curl_setopt($curl,CURLOPT_VERBOSE,true);
    curl_setopt($curl,CURLOPT_NOPROGRESS,true);
    curl_setopt($curl,CURLOPT_STDERR,$vbh);

    /* Assign runtime parameters as options */
    if(isset($options) && is_array($options)){
        foreach($options as $param => $value) curl_setopt($curl, $param, $value);
    }

    /* Execute the request and store responses */
    $res->response=curl_exec($curl);
    $res->info=(object)curl_getinfo($curl);
    $res->errors=curl_error($curl);

    rewind($vbh);
    $res->verbose=stream_get_contents($vbh);
    fclose($vbh);

    curl_close($curl);
    return $res;
}
/**
 * Parse an HTML string into a DOMDocument, collecting libxml errors instead of emitting warnings.
 *
 * @param string|false $data  HTML markup to parse. A falsy value is treated as "no data".
 * @param bool         $debug When true and libxml reported parse errors, the array of
 *                            errors is returned instead of the document.
 * @return DOMDocument|array|null The parsed document; the libxml error list (debug mode
 *                                with errors); or null after the failure message was echoed.
 */
function getdom($data=false, $debug=false){
    /* Remember the caller's libxml error mode so that it can be restored on every exit path */
    $previous=libxml_use_internal_errors(true);
    try{
        if(!$data)throw new Exception('No data passed whilst trying to invoke DOMDocument');

        $dom = new DOMDocument();
        $dom->validateOnParse=false;
        $dom->standalone=true;
        $dom->strictErrorChecking=false;
        $dom->recover=true;
        $dom->formatOutput=false;
        $dom->loadHTML($data);

        /* Read the collected errors before clearing libxml's buffer */
        $errors=libxml_get_errors();
        libxml_clear_errors();

        return !empty($errors) && $debug ? $errors : $dom;
    }catch(Exception $e){
        echo $e->getMessage();
        return null;
    }finally{
        libxml_use_internal_errors($previous);
    }
}
/* Fetch the page; only a successful (HTTP 200) response is parsed */
$obj=_curlrequest($url);
if($obj->info->http_code==200){
    $dom=getdom($obj->response);

    /* getdom() yields null for an empty body; DOMXPath needs a real document */
    if($dom instanceof DOMDocument){
        $xp=new DOMXPath($dom);

        /* Every <div> whose class attribute contains this class pair */
        $query='//div[ contains(@class,"cRiga3 boxtriageS") ]';
        $col=$xp->query($query);

        if(!empty($col) && $col->length > 0){
            foreach($col as $node)echo $node->nodeValue . '<br />';
        }
    }
}
输出如下:
2 20
37
>1h
1
2
24
10
5
7
32
29
0
3
25
5
0
0
6
2
以上是 提取字符串使用PHP广告的XPath刮 的全部内容, 来源链接: utcz.com/qa/257360.html