<?php
/*
 * Prints a short plain-text excerpt of wiki.html inside a minimal HTML page.
 *
 * Steps:
 *   1. Load wiki.html with phpQuery.
 *   2. Drop tables, headings, the 'coordinates' layer, 'edit' links, <small>,
 *      <div> and <script> elements, unwrap <span> elements, and drop anchors
 *      whose text looks like "[12]".
 *   3. Flatten what is left to plain text, collapse runs of newlines, trim it
 *      and cut it to at most MAXLEN characters.
 *   4. Escape the text for HTML, turn newlines into <br> and print it,
 *      followed by "..." when the text was cut.
 */

mb_internal_encoding("UTF-8"); // encoding for mbstring
include 'phpQuery-onefile.php';
?>
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>DOM</title>
</head>
<body>
<?php
$doc = phpQuery::newDocumentFileHTML("wiki.html");

$doc["table"]->remove();            // remove tables as well as their contents
$doc[":header"]->remove();          // remove <h1>-<h6>
$doc["#coordinates"]->remove();     // remove the float layer 'coordinates'
$doc["span.editsection"]->remove(); // remove the 'edit' links
$doc["small"]->remove();

// remove <span> but keep its content
foreach ($doc["span"] as $span) {
    $span = pq($span);
    $span->after($span->text())->remove();
}

$doc["div"]->remove();
$doc["script"]->remove();

// remove [number] reference links
foreach ($doc["a"] as $a) {
    $a = pq($a);
    if (preg_match("/^\\[\\d+\\]$/", trim($a->text()))) {
        $a->remove();
    }
}

// Work on the plain text from here on. Cutting serialized HTML could split an
// entity such as &amp; in half and would count entity characters against the
// length limit instead of the characters the reader actually sees.
$text = $doc->text(); // drops all remaining html tags

$text = preg_replace('/\n{2,}/', "\n", $text); // \n\n\n\n\n => \n

// Trim before measuring so leading blank lines do not eat into the excerpt.
// The u modifier makes \s operate on UTF-8 characters rather than raw bytes.
$text = preg_replace('/^\s+|\s+$/u', "", $text);

define("MAXLEN", 400);
$shortened = false;
if (mb_strlen($text) > MAXLEN) {
    $text = mb_substr($text, 0, MAXLEN);
    // the cut may have landed right after a newline or a space
    $text = preg_replace('/\s+$/u', "", $text);
    $shortened = true;
}

// Escape once, at output time, so characters like < and & in the text are
// shown literally instead of being interpreted as markup.
$html = htmlspecialchars($text, ENT_QUOTES, "UTF-8");
$html = str_replace("\n", "<br>", $html); // \n => <br>
if ($shortened) {
    $html .= "...";
}

echo $html;
?>
</body>
</html>
|