получить значение тега title, используя DOMDocument

Question

получить значение тега title, используя DOMDocument

Я хочу получить значение тега <title> для всех страниц моего сайта. Я пытаюсь запустить скрипт только на домене моего сайта и получить ссылки на все его страницы вместе с их заголовками (title).

Это мой код:

// Download the markup of the site's start page.
$pageSource = file_get_contents('http://xxxxxxxxx.com');

// Build a DOM tree from the markup. The "@" hides the warnings that
// loadHTML() raises when the markup is not well-formed.
$document = new DOMDocument();
@$document->loadHTML($pageSource);

// Walk over every <a> element of the page. Any other tag name
// ('img', 'table', ...) could be queried the same way.
foreach ($document->getElementsByTagName('a') as $anchor) {
    // Print the link text immediately followed by its "href" value.
    echo $anchor->nodeValue, $anchor->getAttribute('href'), '<br>';
}

Что я получаю: для ссылки <a href="z1.html">z2</a> выводятся z1.html и z2.
У моей страницы z1.html есть title со значением z3; я хочу получить z1.html и z3, а не z2. Может кто-нибудь мне помочь?

0

html php scrape

Решение

Другие решения

Вам нужно написать собственную функцию и вызывать её в нужных местах; если нужно получить несколько тегов со страниц, на которые указывают теги ссылок (<a>), просто напишите ещё одну такую функцию.

Код ниже поможет вам начать:

// Page whose links are crawled. Relative hrefs found on it (e.g. "z1.html")
// are resolved against this URL before they are downloaded.
$base_url = 'http://www.anchorartspace.org/';

$doc = my_load_html(my_curl_function($base_url));
$title = ($doc === null) ? '' : my_document_title($doc);
$links = ($doc === null) ? array() : $doc->getElementsByTagName('a');

//Iterate over the extracted links and display their URLs and page titles
foreach ($links as $link) {
    // Anchors without a downloadable target (no href, "#fragment",
    // mailto:, javascript:, ...) are skipped.
    $url = my_resolve_url($link->getAttribute('href'), $base_url);
    if ($url === null) {
        continue;
    }

    // Link text and target come from a remote page, so they are escaped
    // before being printed as HTML.
    echo my_escape($link->nodeValue);
    echo "<br/>" . 'MY ANCHOR LINK : - ' . my_escape($url) . "---TITLE--->";

    // Download the linked page and print its <title>. The title part stays
    // empty when the page cannot be loaded or has no <title> element.
    $a_doc = my_load_html(my_curl_function($url));
    if ($a_doc !== null) {
        echo my_escape(my_document_title($a_doc));
    }
    echo '<br/>';
}
echo 'Title: ' . my_escape($title) . '<br/><br/>';

/**
 * Escape a text value for output inside an HTML page.
 *
 * @param string $text Raw text.
 * @return string Text with HTML special characters and quotes escaped.
 */
function my_escape($text) {
    return htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
}

/**
 * Parse an HTML string into a DOMDocument.
 *
 * @param mixed $html Markup to parse; anything other than a non-blank
 *                    string (e.g. false from a failed download) is rejected.
 * @return DOMDocument|null The parsed document, or null when there is nothing to parse.
 */
function my_load_html($html) {
    // loadHTML() rejects an empty source, so a failed or empty download
    // must never reach it.
    if (!is_string($html) || trim($html) === '') {
        return null;
    }

    // Collect libxml parse warnings internally instead of hiding them with "@",
    // then restore whatever error mode was active before.
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $loaded ? $doc : null;
}

/**
 * Return the text of the first <title> element of a document.
 *
 * @param DOMDocument $doc Parsed page.
 * @return string Title text, or '' when the page has no <title>.
 */
function my_document_title($doc) {
    $node = $doc->getElementsByTagName('title')->item(0);
    return ($node === null) ? '' : $node->nodeValue;
}

/**
 * Turn an href found on the page at $base_url into an absolute http(s) URL.
 *
 * Limitations: "." and ".." path segments are left as they are, and a
 * <base href> element on the page is not taken into account.
 *
 * @param string $href     Raw value of the anchor's href attribute.
 * @param string $base_url Absolute URL of the page the anchor was found on.
 * @return string|null Absolute URL, or null when the link cannot be downloaded.
 */
function my_resolve_url($href, $base_url) {
    $href = trim($href);
    if ($href === '' || $href[0] === '#') {
        return null;
    }

    // Already absolute: only http and https targets can be fetched here.
    if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $href)) {
        return preg_match('~^https?://~i', $href) ? $href : null;
    }

    $base = parse_url($base_url);
    if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
        return null;
    }
    $origin = $base['scheme'] . '://' . $base['host'];
    if (isset($base['port'])) {
        $origin .= ':' . $base['port'];
    }
    $path = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

    // Protocol-relative link: "//host/path".
    if (strpos($href, '//') === 0) {
        return $base['scheme'] . ':' . $href;
    }
    // Root-relative link: "/path".
    if ($href[0] === '/') {
        return $origin . $href;
    }
    // Query-only link: same document, different query string.
    if ($href[0] === '?') {
        return $origin . $path . $href;
    }

    // Document-relative link: resolve against the directory of the base path.
    $slash = strrpos($path, '/');
    $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);
    return $origin . $dir . $href;
}

/**
 * Fetch a URL with cURL and hand back whatever curl_exec() produced.
 *
 * @param string $url Address to request.
 * @return string|bool The response body (CURLOPT_RETURNTRANSFER is enabled),
 *                     or false when the transfer fails.
 */
function my_curl_function($url) {
    $request = curl_init();

    // Return the body from curl_exec() instead of printing it.
    curl_setopt($request, CURLOPT_RETURNTRANSFER, 1);
    // Identify the client and give up if no connection is made within 2 seconds.
    curl_setopt($request, CURLOPT_USERAGENT, 'name');
    curl_setopt($request, CURLOPT_CONNECTTIMEOUT, 2);
    curl_setopt($request, CURLOPT_URL, $url);

    $response = curl_exec($request);
    curl_close($request);

    return $response;
}

Дайте знать, если понадобится дополнительная помощь.

0

Источник

Accepted Answer

Немного дополнил ответ hitesh: добавил проверку, есть ли у элементов атрибуты и существует ли нужный атрибут, а также проверку того, что поиск элементов title действительно вернул хотя бы один элемент, прежде чем пытаться получить первый ($a_html_title->item(0)).

Ещё добавил опцию, чтобы curl следовал перенаправлениям (CURLOPT_FOLLOWLOCATION); она нужна для моего теста с жёстко заданным адресом google.com.

// Collect libxml parse warnings internally instead of hiding them with "@".
libxml_use_internal_errors(true);

// For every anchor in $links: print its text, its target and the <title>
// of the page the target points to.
foreach ($links as $link) {

    // Only anchors that actually carry an "href" attribute can be followed.
    if (!$link->hasAttributes() || !$link->hasAttribute('href')) {
        continue;
    }

    $href = $link->getAttribute('href');
    $href = 'http://google.com';   // hardcoding just for testing; remove this line to fetch the real link target

    // Link text and target come from a remote page, so they are escaped
    // before being printed as HTML.
    echo htmlspecialchars($link->nodeValue, ENT_QUOTES, 'UTF-8');
    echo "<br/>".'MY ANCHOR LINK : - ' . htmlspecialchars($href, ENT_QUOTES, 'UTF-8') . "---TITLE--->";

    $a_html = my_curl_function($href);

    // A failed download yields false (or an empty body). loadHTML() rejects
    // an empty source, so only real markup is parsed.
    if (is_string($a_html) && $a_html !== '') {
        $a_doc = new DOMDocument();
        $a_doc->loadHTML($a_html);
        libxml_clear_errors();

        //get and display what you need:
        $a_title_nodes = $a_doc->getElementsByTagName('title');
        if ($a_title_nodes->length) {
            echo htmlspecialchars($a_title_nodes->item(0)->nodeValue, ENT_QUOTES, 'UTF-8');
        }
    }

    // Always close the row, even when no title could be printed, so that
    // the output of consecutive links does not run together.
    echo '<br/>';
}

/**
 * Download a URL with cURL, following redirects, and return the response body.
 *
 * @param string $url Absolute URL to fetch.
 * @return string|false Response body, or false when the cURL handle cannot
 *                      be created or the transfer fails.
 */
function my_curl_function($url) {
    $curl_handle = curl_init();
    if ($curl_handle === false) {
        return false;
    }

    curl_setopt($curl_handle, CURLOPT_URL, $url);
    curl_setopt($curl_handle, CURLOPT_CONNECTTIMEOUT, 2);
    // CONNECTTIMEOUT only limits the connection phase; cap the whole transfer
    // as well so that a server that stalls mid-response cannot hang the crawl.
    curl_setopt($curl_handle, CURLOPT_TIMEOUT, 10);
    curl_setopt($curl_handle, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl_handle, CURLOPT_USERAGENT, 'name');
    curl_setopt($curl_handle, CURLOPT_FOLLOWLOCATION, true);    // added this
    // Bound the number of redirects that are followed to avoid redirect loops.
    curl_setopt($curl_handle, CURLOPT_MAXREDIRS, 5);

    $html = curl_exec($curl_handle);
    curl_close($curl_handle);
    return $html;
}

1