Created
December 3, 2023 20:36
-
-
Save HelgeSverre/a0b294991d69120a9411f9b5ddaa76b9 to your computer and use it in GitHub Desktop.
Experimental code to compres HTML without losing the "structure", for a web scraping with ai use-case to reduce required token-usage.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App; | |
use Illuminate\Support\Str; | |
use Symfony\Component\DomCrawler\Crawler; | |
/**
 * Shrinks an HTML document while keeping its element structure.
 *
 * Unwanted elements, comments, text-less <div>/<p> elements and noisy
 * attributes are stripped from the DOM, after which the remaining <body>
 * markup is whitespace-compacted. Intended to reduce the size of HTML that is
 * handed to a downstream consumer (e.g. a language model).
 */
class HtmlCompressor
{
    /** Whether HTML comment nodes are stripped. */
    protected bool $removeComments = true;

    /** Whether <div> and <p> elements without any text are stripped. */
    protected bool $removeEmptyElements = true;

    /**
     * CSS selectors; every matching element is removed together with its subtree.
     *
     * @var array<int, string>
     */
    protected array $elementsToRemove = [
        // Elements
        'script', 'style', 'link', 'head', 'noscript', 'template', 'footer', 'svg', 'br', 'hr',
        // Fontawesome icons
        'i.fa', 'i.fas', 'i.far', 'i.fal',
    ];

    /**
     * Attribute names that are dropped from every element.
     * Attributes prefixed with "aria-" or "data-" are always dropped as well.
     *
     * @var array<int, string>
     */
    protected array $attributesToRemove = [
        'alt',
        'for',
        'href',
        'onclick',
        'onerror',
        'onsubmit',
        'placeholder',
        'role',
        'src',
        'style',
        'tabindex',
    ];

    /**
     * Each argument overrides the corresponding default when it is not null.
     *
     * @param array<int, string>|null $elementsToRemove    CSS selectors of elements to strip.
     * @param array<int, string>|null $attributesToRemove  Attribute names to strip.
     * @param bool|null               $removeComments      Strip HTML comments.
     * @param bool|null               $removeEmptyElements Strip text-less <div>/<p> elements.
     */
    public function __construct(
        ?array $elementsToRemove = null,
        ?array $attributesToRemove = null,
        ?bool $removeComments = null,
        ?bool $removeEmptyElements = null,
    ) {
        $this->elementsToRemove = $elementsToRemove ?? $this->elementsToRemove;
        $this->attributesToRemove = $attributesToRemove ?? $this->attributesToRemove;
        $this->removeComments = $removeComments ?? $this->removeComments;
        $this->removeEmptyElements = $removeEmptyElements ?? $this->removeEmptyElements;
    }

    /**
     * Compresses $html using the default configuration.
     *
     * @param string $html Raw HTML document or fragment.
     *
     * @return string Compacted markup of the document body.
     */
    public static function compress(string $html): string
    {
        return (new self())->compressHtml($html);
    }

    /**
     * Compresses $html with the default configuration and returns only its text.
     *
     * @param string $html Raw HTML document or fragment.
     *
     * @return string Text content with whitespace collapsed; '' when nothing is left.
     */
    public static function textOnly(string $html): string
    {
        $raw = self::compress($html);

        // The compressor glues adjacent tags together ("</p><p>"); put a space
        // back so the text of neighbouring elements does not run together.
        $dom = new Crawler(Str::of($raw)->replace('><', '> <')->toString());

        // Passing a default keeps an empty document from raising
        // "The current node list is empty".
        return Str::of($dom->text(''))->squish()->trim()->toString();
    }

    /**
     * Strips the configured elements, comments, empty elements and attributes
     * from $html and returns the whitespace-compacted content of its <body>.
     *
     * @param string $html Raw HTML document or fragment.
     *
     * @return string Compacted body markup; '' when the document has no <body>
     *                (for example when $html is empty).
     */
    public function compressHtml(string $html): string
    {
        $crawler = new Crawler($html);

        // Remove elements we don't need.
        foreach ($this->elementsToRemove as $selector) {
            $crawler->filter($selector)->each(static fn (Crawler $node) => self::removeNode($node));
        }

        if ($this->removeComments) {
            $crawler->filterXPath('//comment()')->each(static fn (Crawler $node) => self::removeNode($node));
        }

        if ($this->removeEmptyElements) {
            // NOTE(review): "empty" means Crawler::text() returns '', so a <div>/<p>
            // that only wraps non-text children is removed too - confirm that is intended.
            $crawler->filter('*')->each(static function (Crawler $node): void {
                if (in_array($node->nodeName(), ['div', 'p'], true) && $node->text() === '') {
                    self::removeNode($node);
                }
            });
        }

        // Remove attributes.
        $crawler->filter('*')->each(function (Crawler $node): void {
            $element = $node->getNode(0);

            if (! $element instanceof \DOMElement) {
                return;
            }

            // Snapshot the attribute map first: it is live, and we mutate it inside the loop.
            foreach (iterator_to_array($element->attributes) as $attribute) {
                $name = $attribute->nodeName;

                if (
                    in_array($name, $this->attributesToRemove, true)
                    || str_starts_with($name, 'aria-')
                    || str_starts_with($name, 'data-')
                ) {
                    $element->removeAttribute($name);
                }
            }
        });

        // Default to '' so input without a <body> yields an empty result instead of an exception.
        $filteredHtml = $crawler->filter('body')->html('');

        return Str::of($filteredHtml)
            ->squish()
            ->replace('> <', '><') // Remove whitespace between adjacent tags: "</div> <div>"
            ->replace(' </', '</') // Remove whitespace before a closing tag: "whatever </tag>"
            ->replace(' ">', '">')
            ->replace(' " ', '"')
            ->replace('> ', '>')
            ->replace(' <', '<')
            ->remove('<span>×</span>')
            ->trim()
            ->toString(); // Return a real string, as the declared return type promises.
    }

    /**
     * Collects the distinct, non-empty "class" attribute values found in $html.
     *
     * Each value is the complete attribute string of one element; it is not
     * split into individual class names.
     *
     * @param bool|string $html Raw HTML. Non-string input (e.g. the false returned
     *                          by a failed read) is treated as an empty document.
     *
     * @return \Illuminate\Support\Collection<int, string>
     */
    public static function classes(bool|string $html)
    {
        $crawler = new Crawler(is_string($html) ? $html : '');

        $classes = $crawler->filter('*')->each(
            static fn (Crawler $node): ?string => $node->attr('class')
        );

        return collect($classes)->filter()->unique()->values();
    }

    /**
     * Detaches the first DOM node wrapped by $node from its parent, if it has one.
     */
    private static function removeNode(Crawler $node): void
    {
        $domNode = $node->getNode(0);

        $domNode?->parentNode?->removeChild($domNode);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment