From 7e6d2eb6f4236b4f04bfb7c976f135a1f33cc107 Mon Sep 17 00:00:00 2001 From: Alexandre Alapetite Date: Wed, 25 Dec 2013 14:21:29 +0100 Subject: [PATCH] =?UTF-8?q?Encore=20plus=20de=20flux=20tol=C3=A9r=C3=A9s?= =?UTF-8?q?=20avec=20leurs=20erreurs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corrige https://github.com/marienfressinaud/FreshRSS/issues/332 --- CHANGELOG | 1 + lib/SimplePie/SimplePie.php | 13 +++++++------ lib/SimplePie/SimplePie/Parser.php | 30 ++++++++++++++++++++---------- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 0c816dbd7..05d3a50ec 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -43,6 +43,7 @@ * PHP : * Meilleure gestion des caractères spéciaux dans différents cas * Amélioration des performances + * Encore plus tolérant pour les flux comportant des erreurs * Chargement automatique des classes * Alternative dans le cas d’absence de librairie JSON * Pour le développement, le cache HTTP peut être désactivé en créant un fichier “./no-cache.txt” diff --git a/lib/SimplePie/SimplePie.php b/lib/SimplePie/SimplePie.php index d20ab5430..f02037c10 100644 --- a/lib/SimplePie/SimplePie.php +++ b/lib/SimplePie/SimplePie.php @@ -1313,7 +1313,7 @@ class SimplePie // First check to see if input has been overridden. 
if ($this->input_encoding !== false) { - $encodings[] = $this->input_encoding; + $encodings[] = strtoupper($this->input_encoding); } $application_types = array('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity'); @@ -1330,18 +1330,18 @@ class SimplePie } else { - $encodings[] = ''; //Let the DOM parser decide first + $encodings[] = ''; //FreshRSS: Let the DOM parser decide first } } elseif (in_array($sniffed, $text_types) || substr($sniffed, 0, 5) === 'text/' && substr($sniffed, -4) === '+xml') { if (isset($headers['content-type']) && preg_match('/;\x20?charset=([^;]*)/i', $headers['content-type'], $charset)) { - $encodings[] = $charset[1]; + $encodings[] = strtoupper($charset[1]); } else { - $encodings[] = ''; + $encodings[] = ''; //FreshRSS: Let the DOM parser decide first } $encodings[] = 'US-ASCII'; } @@ -1364,13 +1364,14 @@ class SimplePie foreach ($encodings as $encoding) { // Change the encoding to UTF-8 (as we always use UTF-8 internally) - if ($utf8_data = (empty($encoding) || $encoding === 'UTF-8') ? $this->raw_data : $this->registry->call('Misc', 'change_encoding', array($this->raw_data, $encoding, 'UTF-8'))) + if ($utf8_data = (empty($encoding) || $encoding === 'UTF-8') ? $this->raw_data : //FreshRSS + $this->registry->call('Misc', 'change_encoding', array($this->raw_data, $encoding, 'UTF-8'))) { // Create new parser $parser = $this->registry->create('Parser'); // If it's parsed fine - if ($parser->parse($utf8_data, 'UTF-8')) + if ($parser->parse($utf8_data, empty($encoding) ? 
'' : 'UTF-8')) //FreshRSS { $this->data = $parser->get_data(); if (!($this->get_type() & ~SIMPLEPIE_TYPE_NONE)) diff --git a/lib/SimplePie/SimplePie/Parser.php b/lib/SimplePie/SimplePie/Parser.php index c4c732787..bd6c4efd8 100644 --- a/lib/SimplePie/SimplePie/Parser.php +++ b/lib/SimplePie/SimplePie/Parser.php @@ -77,6 +77,8 @@ class SimplePie_Parser public function parse(&$data, $encoding) { + $xmlEncoding = ''; + if (!empty($encoding)) { // Use UTF-8 if we get passed US-ASCII, as every US-ASCII character is a UTF-8 character @@ -121,6 +123,7 @@ class SimplePie_Parser $declaration = $this->registry->create('XML_Declaration_Parser', array(substr($data, 5, $pos - 5))); if ($declaration->parse()) { + $xmlEncoding = strtoupper($declaration->encoding); //FreshRSS $data = substr($data, $pos + 2); $data = '<?xml version="' . $declaration->version . '" encoding="' . $encoding . '" standalone="' . (($declaration->standalone) ? 'yes' : 'no') . '"?>' . $data; } @@ -132,17 +135,24 @@ class SimplePie_Parser } } - try //FreshRSS - { - $dom = new DOMDocument(); - $dom->recover = true; - $dom->strictErrorChecking = false; - $dom->loadXML($data); - $this->encoding = $encoding = $dom->encoding = 'UTF-8'; - $data = $dom->saveXML(); - } - catch (Exception $e) + if ($xmlEncoding === '' || $xmlEncoding === 'UTF-8') //FreshRSS: case of no explicit HTTP encoding, and lax UTF-8 { + try + { + $dom = new DOMDocument(); + $dom->recover = true; + $dom->strictErrorChecking = false; + $dom->loadXML($data); + $this->encoding = $encoding = $dom->encoding = 'UTF-8'; + $data2 = $dom->saveXML(); + if (strlen($data2) > (strlen($data) / 2.0)) + { + $data = $data2; + } + } + catch (Exception $e) + { + } } $return = true;