- FileSystemManager.php refactoring of all write/read actions on filesystem Deleted: - old ToolbarManager.php
191 lines
6.8 KiB
PHP
191 lines
6.8 KiB
PHP
<?php
|
|
|
|
namespace App\Service\Scraper;
|
|
|
|
use App\Entity\Chapter;
|
|
use App\Entity\ContentSource;
|
|
use Exception;
|
|
use GuzzleHttp\Exception\GuzzleException;
|
|
use Symfony\Component\Panther\Client as PantherClient;
|
|
|
|
/**
 * Scraper for content sources whose reader has to be rendered by a browser.
 *
 * Pages are opened in a Chrome client driven by Symfony Panther and the image
 * URLs are read from the resulting DOM.
 */
class JavascriptScraper extends AbstractScraper
{
    /**
     * Scrapes every page of a chapter, downloads the images and packs them into a CBZ.
     *
     * The CBZ path is stored on the chapter, which is then persisted and flushed.
     * The temporary download directory is handed to cleanupTempFiles() before
     * returning, whether or not the scrape succeeded.
     *
     * @return array|bool List of pages, each with 'image_url', 'page_number' and
     *                    'local_image_url' keys. This implementation never returns a bool.
     *
     * @throws Exception When no valid chapter URL exists, when no image is found,
     *                   or when the temporary directory cannot be created.
     */
    public function scrapeChapter(Chapter $chapter, ContentSource $contentSource): array|bool
    {
        $manga = $chapter->getManga();
        $chapterUrl = $this->getValidChapterUrl($contentSource, $manga, $chapter->getNumber());

        if (!$chapterUrl) {
            throw new Exception("Aucune URL valide trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
        }

        // Only build the browser client once there is a URL to visit, and keep every
        // call that uses it inside the try so the finally block always runs.
        $pantherClient = PantherClient::createChromeClient();
        $tempDir = null;

        try {
            $pantherClient->request('GET', $chapterUrl);
            $this->selectChapter($pantherClient, $chapter, $contentSource);

            $pageData = $contentSource->getNextPageSelector() === null
                ? $this->scrapeVerticalReaderJavascript($pantherClient, $contentSource, $chapter)
                : $this->scrapeHorizontalReaderJavascript($pantherClient, $contentSource, $chapter);

            // Without this guard an empty result would still record a CBZ path on the chapter.
            if ($pageData === []) {
                throw new Exception("Aucune image trouvée pour le chapitre {$chapter->getNumber()} du manga {$manga->getTitle()}");
            }

            $tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
            if (!mkdir($tempDir) && !is_dir($tempDir)) {
                throw new Exception("Impossible de créer le répertoire temporaire {$tempDir}");
            }

            $totalPages = count($pageData);

            // Download and save the images. Entries are updated through their key rather
            // than by reference, so no dangling reference outlives the loop.
            foreach ($pageData as $index => $page) {
                // parse_url() yields null when the URL has no path component.
                $urlPath = (string) parse_url($page['image_url'], PHP_URL_PATH);
                $imageName = sprintf('%03d.%s', $index + 1, pathinfo($urlPath, PATHINFO_EXTENSION));
                $imagePath = $tempDir . '/' . $imageName;

                $this->downloadAndSaveImage($page['image_url'], $imagePath);
                $this->dispatchProgressEvent($chapter, $index + 1, $totalPages);

                $pageData[$index]['local_image_url'] = $imagePath;
            }

            $cbzFilePath = $this->generateCbzPath($manga, $chapter);
            $this->createCbzFile($pageData, $cbzFilePath);

            $chapter->setCbzPath($cbzFilePath);
            $this->entityManager->persist($chapter);
            $this->entityManager->flush();

            return $pageData;
        } finally {
            try {
                if ($tempDir !== null && is_dir($tempDir)) {
                    $this->cleanupTempFiles($tempDir);
                }
            } finally {
                // Runs even if the temp-file cleanup itself fails.
                $pantherClient->close();
            }
        }
    }

    /**
     * Dry-run of the scraping for a slug / chapter number: returns the page list
     * without downloading images or persisting anything.
     *
     * @return array List of pages as produced by the vertical or horizontal reader scraper.
     *
     * @throws Exception When the generated chapter URL is not valid.
     */
    public function testScraping(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
    {
        $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

        if (!$this->isChapterUrlValid($chapterUrl)) {
            throw new Exception("Invalid URL, check format and slug");
        }

        $pantherClient = PantherClient::createChromeClient();

        try {
            $pantherClient->request('GET', $chapterUrl);

            // Throw-away chapter: only its number is needed to pick the right option.
            $chapter = new Chapter();
            $chapter->setNumber((float) $chapterNumber);

            $this->selectChapter($pantherClient, $chapter, $contentSource);

            return $contentSource->getNextPageSelector() === null
                ? $this->scrapeVerticalReaderJavascript($pantherClient, $contentSource, $chapter)
                : $this->scrapeHorizontalReaderJavascript($pantherClient, $contentSource, $chapter);
        } finally {
            $pantherClient->close();
        }
    }

    /**
     * Tells whether this scraper handles the given scraping type.
     */
    public function supports(string $scrapingType): bool
    {
        return $scrapingType === 'javascript';
    }

    /**
     * Picks the chapter in the source's chapter drop-down, when the source defines one.
     *
     * Does nothing when the content source has no chapter selector or when the
     * selector matches no element after waiting for it.
     *
     * @throws Exception When no option of the drop-down mentions the chapter number.
     */
    private function selectChapter(PantherClient $pantherClient, Chapter $chapter, ContentSource $contentSource): void
    {
        $chapterSelector = $contentSource->getChapterSelector();
        if (!$chapterSelector) {
            return;
        }

        $crawler = $pantherClient->waitFor($chapterSelector);
        $select = $crawler->filter($chapterSelector);

        if ($select->count() === 0) {
            return;
        }

        $chapterNumber = $chapter->getNumber();

        // The number is escaped so the dot of a decimal chapter ("10.5") is literal, and
        // the look-arounds stop chapter "1" from matching the "1" or "5" of "1.5".
        $pattern = sprintf('/(?<!\d\.)\b%s\b(?!\.\d)/', preg_quote((string) $chapterNumber, '/'));
        $targetIndex = null;

        foreach ($select->filter('option') as $index => $option) {
            if (preg_match($pattern, $option->getText()) === 1) {
                $targetIndex = $index;
                break;
            }
        }

        if ($targetIndex === null) {
            throw new Exception("Chapitre $chapterNumber non trouvé dans le menu déroulant");
        }

        // Selector and index travel as script arguments instead of being spliced into the
        // JavaScript source, so quotes inside the selector cannot break the script.
        // The event bubbles, as a native "change" event does.
        $pantherClient->executeScript(
            <<<'JS'
            var select = document.querySelector(arguments[0]);
            select.selectedIndex = arguments[1];
            select.dispatchEvent(new Event('change', { bubbles: true }));
            JS,
            [$chapterSelector, $targetIndex],
        );

        $this->waitForImagesLoaded($pantherClient, $contentSource);
    }

    /**
     * Blocks until the number of fully loaded reader images stops changing.
     *
     * The in-page script polls every 200 ms and resolves once the count of loaded
     * images has been identical for 10 consecutive checks.
     */
    private function waitForImagesLoaded(PantherClient $pantherClient, ContentSource $contentSource): void
    {
        $imageSelector = $contentSource->getImageSelector();

        // The selector is read from arguments[0] at the top of the script: the nested
        // checkImages() function has its own `arguments` object.
        $script = <<<'JS'
            const selector = arguments[0];

            return new Promise((resolve) => {
                let lastImageCount = 0;
                let stableCount = 0;
                const stableThreshold = 10;

                function checkImages() {
                    const images = document.querySelectorAll(selector);
                    const loadedImages = Array.from(images).filter(img => img.complete && img.naturalWidth > 0);

                    if (loadedImages.length === lastImageCount) {
                        stableCount++;
                    } else {
                        stableCount = 0;
                        lastImageCount = loadedImages.length;
                    }

                    if (stableCount >= stableThreshold) {
                        resolve(true);
                    } else {
                        setTimeout(checkImages, 200);
                    }
                }

                checkImages();
            });
            JS;

        $pantherClient->wait(30)->until(
            static fn ($driver) => $driver->executeScript($script, [$imageSelector]),
        );
    }

    /**
     * Collects the image URLs of a reader that shows all pages at once.
     *
     * Each image's URL comes from its "src" attribute, falling back to "data-src"
     * when "src" is empty. $chapter is currently unused.
     *
     * @return array List of ['image_url' => ..., 'page_number' => int] entries in DOM order.
     */
    private function scrapeVerticalReaderJavascript(PantherClient $pantherClient, ContentSource $contentSource, Chapter $chapter): array
    {
        $imageSelector = $contentSource->getImageSelector();
        $images = $pantherClient->waitFor($imageSelector)->filter($imageSelector);

        $pageData = [];

        foreach ($images as $index => $image) {
            $imageUrl = $image->getAttribute('src') ?: $image->getAttribute('data-src');

            $pageData[] = [
                'image_url' => $this->cleanImageUrl($imageUrl),
                'page_number' => $index + 1,
            ];
        }

        return $pageData;
    }

    /**
     * Placeholder for readers that show one page at a time.
     *
     * Not implemented yet: always returns an empty list and ignores its arguments.
     *
     * @return array Always empty.
     */
    private function scrapeHorizontalReaderJavascript(PantherClient $pantherClient, ContentSource $contentSource, Chapter $chapter): array
    {
        return [];
    }
}
|