当前位置: 首页 > 工具软件 > YQL > 使用案例 >

yql 提取html,YQL: html table is no longer supported

别浩漫
2023-12-01

Thank you very much for your code.

It helped me create my own script to read the pages I need. I had never programmed in PHP before, but with your code and the wisdom of the internet I was able to adapt your script to my needs.

PHP

// Let browser-side JavaScript from any origin call this proxy.
header('Access-Control-Allow-Origin: *'); //all

// The only origin this proxy is willing to fetch from.
$allowedOrigin = "https://www.xxxx.yy";

// A missing or non-string parameter (e.g. ?url[]=x) is treated as empty
// so that it fails the check below instead of raising a notice.
$url = (isset($_GET['url']) && is_string($_GET['url'])) ? $_GET['url'] : '';

// Accept the bare origin, or the origin followed by "/". Requiring the
// slash rejects look-alike targets such as "https://www.xxxx.yy.evil.example"
// or "https://www.xxxx.yy@evil.example", which a plain prefix match lets
// through. The compared length is derived from the prefix itself so the two
// can never drift apart.
$originWithSlash = $allowedOrigin . '/';
$isAllowed = $url === $allowedOrigin
    || strncmp($url, $originWithSlash, strlen($originWithSlash)) === 0;

if (!$isAllowed) {
    echo "Only https://www.xxxx.yy allowed!";
    return;
}

// XPath expression chosen by the caller; empty when not supplied.
$xpathQuery = (isset($_GET['xpath']) && is_string($_GET['xpath'])) ? $_GET['xpath'] : '';

// NOTE(review): this is still only a basic check. The XPath expression is
// caller-controlled and the proxy is open to every origin; tighten before
// exposing it publicly.

/**
 * Fetch a URL with cURL and return the response body.
 *
 * Cookies are read from and written to "cookiemon.txt", a path relative to
 * the working directory. Redirects are not followed. The caller's own
 * User-Agent header is forwarded when the request carries one.
 *
 * NOTE(review): the cookie file is one fixed path, so it looks like every
 * caller shares the same cookies; confirm that is intended.
 *
 * @param string $target_url URL to fetch.
 * @return string|false Response body, or false when cURL cannot be
 *                      initialised or the transfer fails.
 */
function check($target_url){
    $check = curl_init();
    if ($check === false) {
        return false;
    }

    curl_setopt($check, CURLOPT_COOKIEJAR, 'cookiemon.txt');
    curl_setopt($check, CURLOPT_COOKIEFILE, 'cookiemon.txt');

    // CURLOPT_TIMEOUT counts seconds, so this caps the whole transfer at 40 s.
    curl_setopt($check, CURLOPT_TIMEOUT, 40);

    // Hand the body back from curl_exec() instead of printing it.
    curl_setopt($check, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($check, CURLOPT_URL, $target_url);

    // The header is absent for CLI runs and for clients that omit it.
    if (isset($_SERVER['HTTP_USER_AGENT'])) {
        curl_setopt($check, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);
    }

    // Never follow redirects: the target was validated by the caller and a
    // redirect could lead somewhere that was not.
    curl_setopt($check, CURLOPT_FOLLOWLOCATION, false);

    $tmp = curl_exec($check);
    curl_close($check);

    return $tmp;
}

// get html
$html = check($url);

// Markup that will be returned; stays empty when anything below fails.
$renderedHtml = '';

// check() yields false when the request fails and loadHTML() rejects an
// empty source, so only parse when there is something to parse. The XPath
// expression is checked the same way because an empty one is invalid.
if (is_string($html) && $html !== '' && is_string($xpathQuery) && $xpathQuery !== '') {

    // Real-world HTML is rarely valid; collect the parser's complaints
    // internally rather than silencing every error with "@".
    $previousErrorMode = libxml_use_internal_errors(true);

    $dom = new DOMDocument();
    $loaded = $dom->loadHTML($html);

    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    if ($loaded) {
        // apply xpath filter
        $xpath = new DOMXPath($dom);

        // query() returns false when the expression is malformed
        $elements = $xpath->query($xpathQuery);

        if ($elements !== false) {
            $temp_dom = new DOMDocument();

            foreach ($elements as $n) {
                // The document node itself (xpath "/") cannot be imported.
                if ($n instanceof DOMDocument) {
                    continue;
                }

                try {
                    $imported = $temp_dom->importNode($n, true);
                    if ($imported !== false) {
                        $temp_dom->appendChild($imported);
                    }
                } catch (DOMException $e) {
                    // Skip a node that cannot sit at the top level of a
                    // document instead of aborting the whole response.
                }
            }

            $renderedHtml = $temp_dom->saveHTML();
        }
    }
}

// return html in json response
// json structure:
// {html: "xxxx"}
$post_data = array(
    'html' => $renderedHtml
);

// Declare the body as JSON so a browser never renders the fetched markup
// as a page when the endpoint is opened directly.
header('Content-Type: application/json');

echo json_encode($post_data);

?>

Javascript

// Ask the PHP proxy to fetch `url` and return the nodes matching the XPath.
$.ajax({

    url: "url of service",

    type: 'GET',

    // Parse the response body as JSON before calling `success`.
    dataType: "json",

    data: {
        url: url,      // page the proxy should fetch
        xpath: "//*"   // XPath expression selecting the nodes to return
    },

    // jQuery passes the parsed response first; the proxy replies
    // {html: "..."}, so the extracted markup is available as data.html.
    success: function(data) {

    },

    // jQuery passes (jqXHR, textStatus, errorThrown) here: the first
    // argument is the request object, not response data. The proxy's
    // plain-text refusal message is not JSON, so it also ends up in this
    // handler as a parse error.
    error: function(jqXHR, textStatus, errorThrown) {

    }

});

 类似资料: