Added:
- ContentSource handling in message
- ContentSource list, add/update UI
- nextPageSelector and imageSelector can be null
- cleanup
This commit is contained in:
@@ -166,6 +166,21 @@ class MangaScraperService
|
||||
return json_decode(implode("", $output), true);
|
||||
}
|
||||
|
||||
/**
 * Scrapes the page list of one chapter from the given content source and
 * returns it; this method itself only collects page entries.
 *
 * A source whose next-page selector is null is read as a vertical reader
 * (all images taken from the chapter page); any other source is read as a
 * paginated reader that is followed page by page.
 *
 * @param string        $mangaSlug     Slug handed to the source's chapter URL builder.
 * @param string        $chapterNumber Chapter number handed to the same builder.
 * @param ContentSource $contentSource Source providing the chapter URL and selectors.
 *
 * @return array Page entries as built by the vertical/horizontal reader helpers.
 *
 * @throws GuzzleException
 */
public function testScrapingHtml(string $mangaSlug, string $chapterNumber, ContentSource $contentSource): array
{
    $chapterUrl = $contentSource->getChapterUrl($mangaSlug, $chapterNumber);

    if ($contentSource->getNextPageSelector() !== null) {
        // The paginated reader fetches every page itself, starting with the
        // chapter URL, so fetching it here as well would request it twice.
        return $this->scrapeHorizontalReader($chapterUrl, $contentSource);
    }

    return $this->scrapeVerticalReader($this->fetchHtml($chapterUrl), $contentSource);
}
|
||||
|
||||
/**
|
||||
* @throws GuzzleException
|
||||
*/
|
||||
@@ -173,32 +188,32 @@ class MangaScraperService
|
||||
{
|
||||
$chapterUrl = $mangaSource->getChapterUrl($manga->getSlug(), $chapter->getNumber());
|
||||
|
||||
$pageData = [];
|
||||
$currentPageUrl = $chapterUrl;
|
||||
|
||||
$tempDir = sys_get_temp_dir() . '/' . uniqid('manga_scraper_');
|
||||
mkdir($tempDir);
|
||||
|
||||
do {
|
||||
$html = $this->fetchHtml($currentPageUrl);
|
||||
$page = $this->extractMangaPageData($html, $mangaSource);
|
||||
$pageData = [];
|
||||
|
||||
$imageName = sprintf('%03d.%s', count($pageData) + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
|
||||
if ($mangaSource->getNextPageSelector() === null) {
|
||||
// Lecteur vertical
|
||||
$html = $this->fetchHtml($chapterUrl);
|
||||
$pageData = $this->scrapeVerticalReader($html, $mangaSource);
|
||||
} else {
|
||||
// Lecteur horizontal (paginé)
|
||||
$pageData = $this->scrapeHorizontalReader($chapterUrl, $mangaSource);
|
||||
}
|
||||
|
||||
// Télécharger et sauvegarder les images
|
||||
foreach ($pageData as $index => &$page) {
|
||||
$imageName = sprintf('%03d.%s', $index + 1, pathinfo(parse_url($page['image_url'], PHP_URL_PATH), PATHINFO_EXTENSION));
|
||||
$imagePath = $tempDir . '/' . $imageName;
|
||||
|
||||
$this->downloadAndSaveImage($page['image_url'], $imagePath);
|
||||
|
||||
$event = new PageScrappingProgressEvent($chapter->getId(), count($pageData) + 1, 0);
|
||||
$event = new PageScrappingProgressEvent($chapter->getId(), $index + 1, count($pageData));
|
||||
$this->eventDispatcher->dispatch($event, PageScrappingProgressEvent::NAME);
|
||||
|
||||
$pageData[] = [
|
||||
'image_url' => $page['image_url'],
|
||||
'local_image_url' => $imagePath,
|
||||
'page_number' => count($pageData) + 1,
|
||||
];
|
||||
|
||||
$currentPageUrl = $page['next_page_url'];
|
||||
} while ($currentPageUrl);
|
||||
$page['local_image_url'] = $imagePath;
|
||||
}
|
||||
|
||||
$cbzFilePath = $this->generateCbzPath($manga, $chapter);
|
||||
$this->createCbzFile($tempDir, $pageData, $cbzFilePath);
|
||||
@@ -210,7 +225,78 @@ class MangaScraperService
|
||||
// Nettoyage du répertoire temporaire
|
||||
$this->cleanupTempFiles($tempDir);
|
||||
|
||||
return true;
|
||||
return $pageData;
|
||||
}
|
||||
|
||||
/**
 * Builds the page list of a vertical reader from a single HTML document.
 *
 * Every element matched by the source's image selector contributes one page.
 * The URL is read from `src`, falling back to `data-src` when `src` is
 * missing or blank after cleaning. Elements that yield no URL at all are
 * skipped, and pages are numbered contiguously from 1.
 *
 * @param string        $html          HTML of the chapter page.
 * @param ContentSource $contentSource Source providing the image selector.
 *
 * @return array List of ['image_url' => string, 'page_number' => int] entries.
 */
private function scrapeVerticalReader(string $html, ContentSource $contentSource): array
{
    $crawler = new Crawler($html);
    $pageData = [];

    foreach ($crawler->filter($contentSource->getImageSelector()) as $image) {
        // Decide on the cleaned value so a whitespace-only src still falls back.
        $imgUrl = $this->cleanImageUrl($image->getAttribute('src'));
        if ($imgUrl === '') {
            $imgUrl = $this->cleanImageUrl($image->getAttribute('data-src'));
        }

        // An entry without a URL cannot be downloaded later; leave it out.
        if ($imgUrl === '') {
            continue;
        }

        $pageData[] = [
            'image_url' => $imgUrl,
            // Numbered from the collected count so skipped elements leave no gaps.
            'page_number' => count($pageData) + 1,
        ];
    }

    return $pageData;
}
|
||||
|
||||
/**
 * Builds the page list of a paginated reader by following next-page links.
 *
 * Starting at the chapter URL, each page is fetched, its image URL is
 * recorded, and the extracted next-page URL becomes the next page to visit.
 * The walk stops when no next-page URL is returned or when the next URL has
 * already been visited, so a reader that links back on itself cannot loop
 * forever.
 *
 * @param string        $chapterUrl    URL of the first page of the chapter.
 * @param ContentSource $contentSource Source passed on to the page-data extractor.
 *
 * @return array List of ['image_url' => string, 'page_number' => int] entries.
 *
 * @throws GuzzleException
 */
private function scrapeHorizontalReader(string $chapterUrl, ContentSource $contentSource): array
{
    $pageData = [];
    $visitedUrls = [];
    $currentPageUrl = $chapterUrl;

    do {
        $visitedUrls[$currentPageUrl] = true;

        $html = $this->fetchHtml($currentPageUrl);
        $page = $this->extractMangaPageData($html, $contentSource);

        $pageData[] = [
            'image_url' => $this->cleanImageUrl($page['image_url']),
            'page_number' => count($pageData) + 1,
        ];

        // A missing key is treated the same as "no further page".
        $currentPageUrl = $page['next_page_url'] ?? null;
    } while ($currentPageUrl && !isset($visitedUrls[$currentPageUrl]));

    return $pageData;
}
|
||||
|
||||
/**
 * Downloads one image into the temporary directory and records it.
 *
 * The local file name is the 1-based page position zero-padded to three
 * digits, followed by the extension found in the URL path. The resulting
 * entry is appended to $pageData, which is passed by reference.
 *
 * @param string  $imgUrl   Image URL as scraped; it is cleaned before use.
 * @param string  $tempDir  Directory the image file is written into.
 * @param array   $pageData Page list, appended to in place.
 * @param int     $index    Zero-based position of the page.
 * @param Chapter $chapter  Not read by this method.
 *
 * @throws GuzzleException
 */
private function processImage(string $imgUrl, string $tempDir, array &$pageData, int $index, Chapter $chapter): void
{
    $pageNumber = $index + 1;
    $cleanUrl = $this->cleanImageUrl($imgUrl);

    $extension = pathinfo(parse_url($cleanUrl, PHP_URL_PATH), PATHINFO_EXTENSION);
    $localPath = $tempDir . '/' . sprintf('%03d.%s', $pageNumber, $extension);

    $this->downloadAndSaveImage($cleanUrl, $localPath);

    // No progress event is dispatched from here.

    $pageData[] = [
        'image_url' => $cleanUrl,
        'local_image_url' => $localPath,
        'page_number' => $pageNumber,
    ];
}
|
||||
|
||||
/**
 * Normalises a scraped image URL.
 *
 * ASCII control characters (0x00-0x1F and 0x7F) are removed first and the
 * result is trimmed afterwards, so whitespace that was hidden behind a
 * leading or trailing control character is removed as well.
 *
 * @param string $url Raw URL as read from the document.
 *
 * @return string The URL without control characters or surrounding whitespace.
 */
private function cleanImageUrl(string $url): string
{
    $withoutControlChars = preg_replace('/[\x00-\x1F\x7F]/', '', $url);

    // preg_replace() returns null on a PCRE error; fall back to the raw
    // input instead of violating the string return type.
    return trim($withoutControlChars ?? $url);
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user