使用PHP和XPath抓取网页并提取字符串

Z时代
2024-01-10
分类：问答

我需要抓取下面这个HTML页面，并用PHP和XPath从中提取一个字符串：

https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115

……也就是使用PHP和XPath获取名为“PO G.TATARELLA-CERIGNOLA”的表格下方绿色框中的值。

（注：如果你打开该页面，可能会看到不同的数值……没关系，这些数值是动态变化的。）

我用下面这段PHP示例代码来打印该值：

<?php
    // Show every error while experimenting with the scraper.
    error_reporting(E_ALL);
    ini_set('display_errors', 'On');

    $url = 'https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115';
    // Absolute path to the target cell, as copied from the browser.
    $xpath_for_parsing = '/html/body/div[4]/table/tbody/tr[2]/td[4]/div';

    // cURL settings: mind the PROXY entry (empty string = no proxy configured here).
    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_AUTOREFERER    => TRUE,
        CURLOPT_HEADER         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_URL            => $url,
        CURLOPT_FOLLOWLOCATION => TRUE,
        CURLOPT_PROXY          => '',
    ));
    $html = curl_exec($handle);
    curl_close($handle);

    // Parse the downloaded markup; parser warnings are silenced with @.
    $document = new DOMDocument();
    @$document->loadHTML($html);
    $finder = new DOMXPath($document);
    $matches = $finder->query($xpath_for_parsing);

    // Keep the last matching node's text; fall back to 'N.D.' when nothing matches.
    $theValue = 'N.D.';
    foreach ($matches as $match) {
        $theValue = $match->nodeValue;
    }
    echo $theValue;
?>

这样，我得到的输出是“N.D.”，而不是我想要的那个值。

页面源代码如下...

在我的代码中，我不想使用“绝对XPath”，所以尝试了类似下面的写法（我知道这行不通，但我是XPath新手……）：

// NOTE: this predicate matches any element that has a child <div> whose text content
// equals "cRiga3 boxtriageS"; it does not look at the class attribute at all.
$xpath_for_parsing = '//*[div="cRiga3 boxtriageS"]';

但结果总是一样的。

任何建议/示例？

回答：

我认为下面的代码应该会有所帮助——您需要调整XPath查询，以便定位到特定的表格和特定的单元格内容，但主体代码似乎可以正常工作。我怀疑原始代码的问题在于URL使用的是https，而发起cURL请求时通常需要额外的配置。_curlrequest函数中有些设置可以删除，我只是从另一个脚本中把它们复制了过来。

请将$cacert的路径改为您系统上cacert.pem副本所在的路径，或者改为curl.haxx.se上的在线版本。

// Address of the page to scrape; handed to _curlrequest() further down.
$url = 'https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115'; 
/**
 * Fetch a URL with cURL and return the body together with diagnostics.
 *
 * @param string|null $url     Address to request. When null, no request is made.
 * @param array|null  $options Extra CURLOPT_* => value pairs; applied after the
 *                             defaults, so they override them.
 *
 * @return object Object with these properties:
 *                - response: body string, or false when curl_exec() fails (null if no request was made)
 *                - verbose:  text cURL wrote to its verbose log (null if no request was made)
 *                - info:     object built from curl_getinfo(); only http_code = 100 if no request was made
 *                - headers:  always null (response headers are not captured)
 *                - errors:   curl_error() text (null if no request was made)
 */
function _curlrequest($url=null, $options=null){
    $cacert='c:/wwwroot/cacert.pem';

    /* Nothing to fetch: return the placeholder result before any resource is opened,
       with `info` as an object so callers can always read $result->info->http_code. */
    if(is_null($url)){
        return (object)array(
            'response' => null,
            'verbose'  => null,
            'info'     => (object)array('http_code' => 100),
            'headers'  => null,
            'errors'   => null
        );
    }

    session_write_close();

    /* In-memory stream that receives cURL's verbose output */
    $vbh = fopen('php://temp', 'w+');

    /* Initialise curl request object */
    $curl=curl_init();

    /* https requests are verified against the CA bundle at $cacert */
    if(parse_url($url,PHP_URL_SCHEME)=='https'){
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($curl, CURLOPT_CAINFO, $cacert);
    }

    /* Define standard options (CURLOPT_BINARYTRANSFER dropped: obsolete and deprecated) */
    curl_setopt($curl, CURLOPT_URL,trim($url));
    curl_setopt($curl, CURLOPT_AUTOREFERER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_FAILONERROR, true);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLINFO_HEADER_OUT, false);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 20);
    curl_setopt($curl, CURLOPT_TIMEOUT, 60);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36');
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($curl, CURLOPT_ENCODING, '');
    curl_setopt($curl,CURLOPT_VERBOSE,true);
    curl_setopt($curl,CURLOPT_NOPROGRESS,true);
    curl_setopt($curl,CURLOPT_STDERR,$vbh);

    /* Assign runtime parameters as options */
    if(isset($options) && is_array($options)){
        foreach($options as $param => $value) curl_setopt($curl, $param, $value);
    }

    /* Execute the request and store responses; same property set as the no-request result */
    $res=(object)array(
        'response' => curl_exec($curl),
        'verbose'  => null,
        'info'     => (object)curl_getinfo($curl),
        'headers'  => null,
        'errors'   => curl_error($curl)
    );

    /* Read back everything cURL logged, then release both resources */
    rewind($vbh);
    $res->verbose=stream_get_contents($vbh);
    fclose($vbh);
    curl_close($curl);

    return $res;
}
/**
 * Parse an HTML string into a DOMDocument, tolerating malformed markup.
 *
 * libxml's internal error buffering is switched on only for the duration of
 * the parse and the caller's previous setting is restored afterwards.
 *
 * @param string|false $data  HTML source to parse.
 * @param bool         $debug When true and libxml reported problems, the list of
 *                            LibXMLError objects is returned instead of the document.
 *
 * @return DOMDocument|array|null The parsed document; the libxml error list in debug
 *                                mode; or null (after echoing a message) when $data is
 *                                empty or an Exception is raised.
 */
function getdom($data=false, $debug=false){
    if(!$data){
        echo 'No data passed whilst trying to invoke DOMDocument';
        return null;
    }

    /* Collect parser errors quietly, remembering the previous setting */
    $previous = libxml_use_internal_errors(true);
    try{
        $dom = new DOMDocument();
        $dom->validateOnParse=false;
        $dom->standalone=true;
        $dom->strictErrorChecking=false;
        $dom->recover=true;
        $dom->formatOutput=false;
        $dom->loadHTML($data);
        $errors=libxml_get_errors();
        return !empty($errors) && $debug ? $errors : $dom;
    }catch(Exception $e){
        echo $e->getMessage();
        return null;
    }finally{
        /* Empty the error buffer and hand the global libxml state back unchanged */
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }
}
/* Fetch the page and parse it only when the server answered 200 */
$obj=_curlrequest($url);
if($obj->info->http_code==200){
    $dom=getdom($obj->response);
    /* getdom() gives back null when there was nothing to parse; DOMXPath needs a real document */
    if($dom instanceof DOMDocument){
        $xp=new DOMXPath($dom);
        /* Relative query: every div whose class attribute contains the given text */
        $query='//div[ contains(@class,"cRiga3 boxtriageS") ]';
        $col=$xp->query($query);
        if(!empty($col) && $col->length > 0){
            foreach($col as $node)echo $node->nodeValue . '<br />';
        }
    }
}

输出如下：

2 20 37 >1h 1 2 24 10 5 7 32 29 0 3 25 5 0 0 6 2

以上是“使用PHP和XPath抓取网页并提取字符串”的全部内容，来源链接： utcz.com/qa/257360.html

使用PHP和XPath抓取网页并提取字符串

回答：

其他人也看了：