Скрипт-скрейпер Amazon работает в XAMPP на Windows, но не работает в PHP5 CLI на Linux

Я пытаюсь собрать (спарсить) коды Amazon ASIN, используя следующий код:

<?php

/**
 * Collects ASIN product codes from category listing pages.
 *
 * The category file holds one listing URL per line. For every category the
 * scraper reads the first page, then keeps following the anchor whose id is
 * "pagnNextLink" until no such anchor is found. ASIN codes are extracted from
 * each page with a regular expression, de-duplicated per category and appended
 * to the output file (one entry per line, preceded by the category URL).
 * Progress and final statistics are echoed.
 */
class Scraper {

    /** Prefix prepended to relative "next page" links. */
    const BASE_URL = "http://www.amazon.com";

    /** Path of the file listing the category URLs. */
    private $categoryFile = "";
    /** Path of the file the collected codes are appended to. */
    private $outputFile = "";
    /** Category URLs read from the category file. */
    private $catArray = array();
    /** URL of the page currently being processed. */
    private $currentPage = NULL;
    /** Per-category lists: index 0 is the category URL, the rest are ASIN codes. */
    private $asin = array();
    private $categoriesMatched = 0;
    /** Number of unique codes found per category, keyed by 1-based category number. */
    private $categoryProducts = array();
    private $pagesMatched = 0;
    private $totalPagesMatched = 0;
    private $productsMatched = 0;
    /** URL of the most recently downloaded page (one-entry cache key). */
    private $cachedUrl = NULL;
    /** Body of the most recently downloaded page ("" when the download failed). */
    private $cachedHtml = "";

    /**
     * @param string $categoryFile Path of the file with one category URL per line.
     * @param string $outputFile   Path of the file the results are appended to.
     */
    public function __construct($categoryFile, $outputFile) {
        $this->categoryFile = $categoryFile;
        $this->outputFile = $outputFile;
    }

    /**
     * Processes every category, writes its codes to the output file and
     * finally prints the statistics.
     */
    public function run() {
        $this->readCategories($this->categoryFile);
        $this->setupASINArray($this->asin);

        $x = 1;

        foreach ($this->catArray as $cat) {
            $this->categoryProducts["$x"] = 0;

            // The category URL itself is the first page of the listing.
            $this->currentPage = $cat;
            $this->scrapeASIN($this->currentPage, $x);
            $this->pagesMatched++;

            // getNextPageLink() advances $this->currentPage when it finds a link.
            while ($this->getNextPageLink($this->currentPage)) {
                $this->pagesMatched++;
                $this->scrapeASIN($this->currentPage, $x);
            }

            echo "Category complete: $this->pagesMatched Pages" . "\n";
            $this->totalPagesMatched += $this->pagesMatched;
            $this->pagesMatched = 0;
            $this->writeASIN($this->outputFile, $x);
            $x++;
            $this->currentPage = NULL;
            $this->categoriesMatched++;
        }

        $this->returnStats();
    }

    /**
     * Loads the category URLs into $this->catArray.
     *
     * Lines are trimmed and blank lines are dropped, so a trailing empty line
     * does not turn into a category with an empty URL. When the file cannot be
     * read the category list is left empty.
     *
     * @param string $categoryFile Path of the category file.
     */
    private function readCategories($categoryFile) {
        $lines = file($categoryFile, FILE_IGNORE_NEW_LINES);

        if ($lines === false) {
            echo "Unable to read category file: $categoryFile" . "\n";
            $this->catArray = array();
            return;
        }

        $categories = array();

        foreach ($lines as $line) {
            $line = trim($line);

            if ($line !== "") {
                $categories[] = $line;
            }
        }

        $this->catArray = $categories;
    }

    /**
     * Seeds the per-category result lists with the category URL at index 0.
     *
     * @param array $asinArray Array to extend; the result is stored in $this->asin.
     */
    private function setupASINArray($asinArray) {
        $x = 0;

        foreach ($this->catArray as $cat) {
            $asinArray["$x"][0] = "$cat";
            $x++;
        }

        $this->asin = $asinArray;
    }

    /**
     * Returns the body of $url, downloading it at most once in a row.
     *
     * scrapeASIN() and getNextPageLink() are called for the same URL back to
     * back; the one-entry cache makes both work on the same response instead
     * of issuing two requests that may return different content.
     *
     * @param string $url Page to load.
     * @return string Page body, or "" when the download failed.
     */
    private function fetchPage($url) {
        if ($this->cachedUrl === $url) {
            return $this->cachedHtml;
        }

        $html = file_get_contents($url);

        if ($html === false) {
            $html = "";
        }

        $this->cachedUrl = $url;
        $this->cachedHtml = $html;

        return $html;
    }

    /**
     * Looks for the "pagnNextLink" anchor on the given page.
     *
     * @param string $currentPage URL of the page to inspect.
     * @return bool True when a next page exists; $this->currentPage then holds its URL.
     */
    private function getNextPageLink($currentPage) {
        $html = $this->fetchPage($currentPage);

        // Nothing to parse: the download failed or the response was empty.
        if ($html === "") {
            return false;
        }

        $document = new DOMDocument();

        // Collect markup warnings internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(true);
        $document->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $xpath = new DOMXPath($document);
        $element = $xpath->query("//a[@id='pagnNextLink']/@href");

        if ($element === false || $element->length == 0) {
            return false;
        }

        $href = $element->item(0)->value;

        // Only relative links need the site prefix.
        if (preg_match('~^https?://~i', $href)) {
            $this->currentPage = $href;
        } else {
            $this->currentPage = self::BASE_URL . $href;
        }

        return true;
    }

    /**
     * Extracts every ASIN code found on the page and adds it to the category's list.
     *
     * @param string $currentPage URL of the page to scan.
     * @param int    $catNo       1-based category number.
     */
    private function scrapeASIN($currentPage, $catNo) {
        $html = $this->fetchPage($currentPage);

        if ($html === "") {
            return;
        }

        $regex = '~(?:www\.)?ama?zo?n\.(?:com|ca|co\.uk|co\.jp|de|fr)/(?:exec/obidos/ASIN/|o/|gp/product/|(?:(?:[^"\'/]*)/)?dp/|)(B[A-Z0-9]{9})(?:(?:/|\?|\#)(?:[^"\'\s]*))?~isx';

        if (preg_match_all($regex, $html, $asin)) {
            foreach ($asin[1] as $match) {
                $this->asin[$catNo-1][] = $match;
            }
        }
    }

    /**
     * De-duplicates the category's entries, updates the counters and appends
     * the entries (category URL first) to the output file.
     *
     * @param string $outputFile Path of the output file.
     * @param int    $catNo      1-based category number.
     */
    private function writeASIN($outputFile, $catNo) {
        $this->fixDupes($catNo);

        $entries = $this->asin[$catNo-1];

        // Index 0 holds the category URL, so it is not counted as a product.
        $found = count($entries) - 1;
        $this->productsMatched += $found;
        $this->categoryProducts["$catNo"] = $found;

        $fh = fopen($outputFile, "a+");

        if ($fh === false) {
            echo "Unable to open output file: $outputFile" . "\n";
            return;
        }

        flock($fh, LOCK_EX);

        foreach ($entries as $asin) {
            fwrite($fh, "$asin" . "\n");
        }

        flock($fh, LOCK_UN);
        fclose($fh);

        echo "$found ASIN codes written to file" . "\n";
    }

    /**
     * Removes duplicate entries from the category's list (string comparison).
     *
     * @param int $catNo 1-based category number.
     */
    private function fixDupes($catNo) {
        $this->asin[$catNo-1] = array_unique($this->asin[$catNo-1], SORT_STRING);
    }

    /**
     * Prints the totals and the per-category product counts.
     */
    public function returnStats() {
        echo "Categories matched: " . $this->categoriesMatched . "\n";
        echo "Pages parsed: " . $this->totalPagesMatched . "\n";
        echo "Products parsed: " . $this->productsMatched . "\n";
        echo "Category breakdown:" . "\n";

        $x = 1;

        foreach ($this->categoryProducts as $catProds) {
            echo "Category $x had $catProds products" . "\n";
            $x++;
        }
    }

}

// Entry point: the category list file and the output file are taken from the
// command line. Bail out with a usage hint instead of reading missing offsets.
if (!isset($argv[1], $argv[2])) {
    echo "Usage: php " . (isset($argv[0]) ? $argv[0] : "scraper.php") . " <category-file> <output-file>" . "\n";
    exit(1);
}

$scraper = new Scraper($argv[1], $argv[2]);
$scraper->run();

?>

Он отлично работает в XAMPP на Windows, но не в Linux. Есть идеи, почему так может быть? Иногда он собирает 0 ASIN для файла, иногда обрабатывает только 1 страницу в категории, где их более 400. При этом в Windows / XAMPP вывод и функциональность полностью корректны.

Буду благодарен за любые мысли!

Заранее спасибо
— Брайс

0

Решение

Попробуйте изменить этот метод следующим образом — хотя бы для того, чтобы избежать сообщений об ошибках:

/**
 * Loads the category list from $categoryFile into $this->catArray, one entry
 * per line with the line endings removed.
 *
 * When the path is not a readable regular file, or reading it fails, a message
 * is printed and the category list is set to an empty array.
 *
 * @param string $categoryFile Path of the category file.
 */
private function readCategories($categoryFile) {

    // file_exists() is also true for directories, and file() may still return
    // false for an existing path, so both cases are checked explicitly.
    if (is_file($categoryFile) && is_readable($categoryFile)) {
        $catArray = file($categoryFile, FILE_IGNORE_NEW_LINES);
    } else {
        $catArray = false;
    }

    if ($catArray === false) {
        echo "File ".$categoryFile.' does not exist or is not readable!' . "\n";
        $this->catArray = array();
        return;
    }

    $this->catArray = $catArray;

}
0

Другие решения

Других решений пока нет …

По вопросам рекламы [email protected]