Rewritten Favicon library using cURL

Reduces the number of requests, is more robust, handles many more cases,
and needs less code
pull/1504/head
Alexandre Alapetite 8 years ago
parent 6d5fb27f97
commit 44c9ae51c4
  1. 2
      CHANGELOG.md
  2. 43
      lib/Favicon/DataAccess.php
  3. 396
      lib/Favicon/Favicon.php
  4. 23
      lib/Favicon/FaviconDLType.php
  5. 110
      lib/favicons.php
  6. 3
      p/f.php

@ -23,7 +23,7 @@
* Improve English [#1465](https://github.com/FreshRSS/FreshRSS/pull/1465)
* Misc.
* Fall back to article URL when the article GUID is empty [#1482](https://github.com/FreshRSS/FreshRSS/issues/1482)
* Update to version 1.2 of Favicon library [#1501](https://github.com/FreshRSS/FreshRSS/issues/1501)
* Rewritten Favicon library using cURL [#1503](https://github.com/FreshRSS/FreshRSS/pull/1503)
## 2017-03-11 FreshRSS 1.6.3

@ -1,43 +0,0 @@
<?php
namespace Favicon;
/**
* DataAccess is a wrapper used to read/write data locally or remotely.
* Aside from SOLID principles, this wrapper is also useful to mock remote resources in unit tests.
* Note: remote access warnings are silenced because we don't care if a website is unreachable.
**/
class DataAccess {
	/**
	 * Read the content found at $url (warnings are suppressed).
	 *
	 * @param string $url location to read
	 * @return string|false the content, or false when it could not be read
	 */
	public function retrieveUrl($url) {
		$this->set_context();
		$body = @file_get_contents($url);
		return $body;
	}

	/**
	 * Fetch the response headers of $url (warnings are suppressed).
	 *
	 * @param string $url location to query
	 * @return array headers with their keys converted by array_change_key_case(),
	 *               or an empty array when no headers could be obtained
	 */
	public function retrieveHeader($url) {
		$this->set_context();
		$rawHeaders = @get_headers($url, 1);
		if (!is_array($rawHeaders)) {
			return array();
		}
		return array_change_key_case($rawHeaders);
	}

	/**
	 * Write $data into the cache file $file.
	 *
	 * @param string $file path of the cache file
	 * @param mixed $data content to store
	 */
	public function saveCache($file, $data) {
		file_put_contents($file, $data);
	}

	/**
	 * Read back the cache file $file.
	 *
	 * @param string $file path of the cache file
	 * @return string|false file content, or false on failure
	 */
	public function readCache($file) {
		$cached = file_get_contents($file);
		return $cached;
	}

	/**
	 * Install the HTTP options used for remote requests as the default
	 * stream context (GET, redirects not followed, 10 second timeout,
	 * custom User-Agent).
	 */
	private function set_context() {
		$httpOptions = array(
			'method' => 'GET',
			'follow_location' => 0,
			'max_redirects' => 1,
			'timeout' => 10,
			'header' => "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:20.0; Favicon; +https://github.com/ArthurHoaro/favicon) Gecko/20100101 Firefox/32.0\r\n",
		);
		stream_context_set_default(array('http' => $httpOptions));
	}
}

@ -1,396 +0,0 @@
<?php
namespace Favicon;
/**
* Locates the favicon of a website and can download it.
* Lookup results (favicon URL and image content) are cached as files in the
* cache directory, named with a type prefix followed by the md5 hash of the URL.
*/
class Favicon
{
// File-name prefixes distinguishing the two kinds of cache entries (favicon URL vs. image content).
protected static $TYPE_CACHE_URL = 'url';
protected static $TYPE_CACHE_IMG = 'img';
// URL to look up when get() is called without one.
protected $url = '';
// Directory holding the cache files.
protected $cacheDir;
// Cache lifetime in seconds; -1 is treated as "never expires" and a falsy value disables cache reads (see checkCache()).
protected $cacheTimeout;
// Object performing the remote requests and the cache file I/O; replaceable through setDataAccess().
protected $dataAccess;
/**
* @param array $args optional settings; only the 'url' key is read here
*/
public function __construct($args = array())
{
if (isset($args['url'])) {
$this->url = $args['url'];
}
$this->cacheDir = __DIR__ . '/../../resources/cache';
// 604800 seconds = 7 days.
$this->cacheTimeout = 604800;
$this->dataAccess = new DataAccess();
}
/**
* Set cache settings:
* - dir: cache directory
* - timeout: in seconds
*
* @param array $args
*/
public function cache($args = array()) {
if (isset($args['dir'])) {
$this->cacheDir = $args['dir'];
}
// empty() means a timeout of 0 cannot be set through this method.
if (!empty($args['timeout'])) {
$this->cacheTimeout = $args['timeout'];
}
}
/**
* Rebuild a URL as scheme://[user[:pass]@]host[:port][path]/ from its parsed components.
* Query string and fragment are dropped, and a trailing slash is always appended.
*
* @param string $url URL to normalise
* @param bool $path whether to keep the path component
*
* @return string|bool the rebuilt URL, or FALSE when the URL cannot be parsed,
*                     its scheme is not http/https, or it has no host
*/
public static function baseUrl($url, $path = false)
{
$return = '';
if (!$url = parse_url($url)) {
return FALSE;
}
// Scheme
$scheme = isset($url['scheme']) ? strtolower($url['scheme']) : null;
if ($scheme != 'http' && $scheme != 'https') {
return FALSE;
}
$return .= "{$scheme}://";
// Username and password
if (isset($url['user'])) {
$return .= $url['user'];
if (isset($url['pass'])) {
$return .= ":{$url['pass']}";
}
$return .= '@';
}
// Hostname
if( !isset($url['host']) ) {
return FALSE;
}
$return .= $url['host'];
// Port
if (isset($url['port'])) {
$return .= ":{$url['port']}";
}
// Path
if( $path && isset($url['path']) ) {
$return .= $url['path'];
}
$return .= '/';
return $return;
}
/**
* Request the headers of a URL, following 301 and 302 redirects for at most 5 requests.
* Any other status code (including other redirect codes) ends the loop.
*
* @param string $url URL to probe
*
* @return array|bool array('status' => last status code, 'url' => last URL),
*                    or false when the URL is empty, no headers were obtained,
*                    or the status line could not be parsed
*/
public function info($url)
{
if(empty($url) || $url === false) {
return false;
}
$max_loop = 5;
// Discover real status by following redirects.
$loop = TRUE;
while ($loop && $max_loop-- > 0) {
$headers = $this->dataAccess->retrieveHeader($url);
if (empty($headers)) {
return false;
}
// Element 0 is the status line; its second space-separated token is the status code.
$exploded = explode(' ', $headers[0]);
if( !isset($exploded[1]) ) {
return false;
}
list(,$status) = $exploded;
switch ($status) {
case '301':
case '302':
$url = isset($headers['location']) ? $headers['location'] : '';
// When several Location values are present, the last one is used.
if (is_array($url)) {
$url = end($url);
}
break;
default:
$loop = FALSE;
break;
}
}
return array('status' => $status, 'url' => $url);
}
/**
* Resolve the URL reached after following redirects (see info()).
*
* @param string $url starting URL
*
* @return string|bool the last URL, or false when info() failed or returned an empty URL
*/
public function endRedirect($url) {
$out = $this->info($url);
return !empty($out['url']) ? $out['url'] : false;
}
/**
* Find remote (or cached) favicon
*
* @param string $url to look for a favicon
* @param int $type type of retrieval (FaviconDLType):
* - HOTLINK_URL: returns remote URL
* - DL_FILE_PATH: returns file path of the favicon downloaded locally
* - RAW_IMAGE: returns the favicon image binary string
*
* @return string|bool favicon URL, false if nothing was found
*/
public function get($url = '', $type = FaviconDLType::HOTLINK_URL)
{
// URLs passed to this method take precedence.
if (!empty($url)) {
$this->url = $url;
}
// Normalised URL, path included, without its trailing slash, for clearer concatenations.
$url = rtrim($this->baseUrl($this->url, true), '/');
$original = $url;
// First attempt: cache, then <link> tags of the page itself (no /favicon.ico probe).
if (($favicon = $this->checkCache($original, self::$TYPE_CACHE_URL)) === false
&& ! $favicon = $this->getFavicon($original, false)
) {
// Second attempt: the site root after following redirects, this time also probing /favicon.ico.
$url = rtrim($this->endRedirect($this->baseUrl($this->url, false)), '/');
if (($favicon = $this->checkCache($url, self::$TYPE_CACHE_URL)) === false
&& ! $favicon = $this->getFavicon($url)
) {
$url = $original;
}
}
// NOTE(review): this call is unconditional, so a failed lookup ($favicon === false) is handed to the cache as well — confirm this is intended.
$this->saveCache($url, $favicon, self::$TYPE_CACHE_URL);
switch ($type) {
case FaviconDLType::DL_FILE_PATH:
return $this->getImage($url, $favicon, false);
case FaviconDLType::RAW_IMAGE:
return $this->getImage($url, $favicon, true);
case FaviconDLType::HOTLINK_URL:
default:
return empty($favicon) ? false : $favicon;
}
}
/**
* Look for the favicon URL of a site.
*
* @param string $url base URL without trailing slash
* @param bool $checkDefault whether to probe "{$url}/favicon.ico" before parsing the page
*
* @return string|bool favicon URL, or a falsy value when none was found
*/
private function getFavicon($url, $checkDefault = true) {
$favicon = false;
if(empty($url)) {
return false;
}
// Try /favicon.ico first.
if( $checkDefault ) {
$info = $this->info("{$url}/favicon.ico");
if ($info['status'] == '200') {
$favicon = $info['url'];
}
}
// See if it's specified in a link tag in domain url.
if (!$favicon) {
$favicon = trim($this->getInPage($url));
}
// Protocol-relative URL: HTTPS is assumed.
if (substr($favicon, 0, 2) === '//') {
$favicon = 'https:' . $favicon;
}
// Make sure the favicon is an absolute URL.
if( $favicon && filter_var($favicon, FILTER_VALIDATE_URL) === false ) {
$favicon = $url . '/' . $favicon;
}
// Sometimes people lie, so check the status.
// And sometimes, it's not even an image. Sneaky bastards!
// If cacheDir isn't writable, that's not our problem
if ($favicon && is_writable($this->cacheDir) && extension_loaded('fileinfo') && !$this->checkImageMType($favicon)) {
$favicon = false;
}
return $favicon;
}
/**
* Find remote favicon and return it as an image
*
* @param string $url URL used as the cache key
* @param string $faviconUrl URL of the favicon to download when it is not cached
* @param bool $image true to return the image content, false to return the cache file name
*
* @return string|bool image content or cache file name (type prefix + md5 of $url,
*                     without the cache directory); false when there is no favicon URL
*                     or the downloaded content is not an image
*/
private function getImage($url, $faviconUrl = '', $image = false)
{
if (empty($faviconUrl)) {
return false;
}
$favicon = $this->checkCache($url, self::$TYPE_CACHE_IMG);
// Favicon not found in the cache
if( $favicon === false ) {
$favicon = $this->dataAccess->retrieveUrl($faviconUrl);
// Definitely not found
if (!$this->checkImageMTypeContent($favicon)) {
return false;
} else {
$this->saveCache($url, $favicon, self::$TYPE_CACHE_IMG);
}
}
if( $image ) {
return $favicon;
}
else
return self::$TYPE_CACHE_IMG . md5($url);
}
/**
* Display data as a PNG Favicon, then exit
* @param $data
*/
private function displayFavicon($data) {
header('Content-Type: image/png');
header('Cache-Control: private, max-age=10800, pre-check=10800');
header('Pragma: private');
header('Expires: ' . date(DATE_RFC822,strtotime('7 day')));
echo $data;
exit;
}
/**
* Download the page at "{$url}/" and search its <head> for a favicon reference.
* <link> elements are tried in this order: rel "shortcut icon", rel "icon",
* then any link whose href contains "favicon".
*
* @param string $url base URL without trailing slash
*
* @return string|bool the href attribute found, or false when nothing matched
*/
private function getInPage($url) {
$html = $this->dataAccess->retrieveUrl("{$url}/");
preg_match('!<head.*?>.*</head>!ims', $html, $match);
if(empty($match) || count($match) == 0) {
return false;
}
$head = $match[0];
$dom = new \DOMDocument();
// Use error suppression, because the HTML might be too malformed.
if (@$dom->loadHTML($head)) {
$links = $dom->getElementsByTagName('link');
foreach ($links as $link) {
if ($link->hasAttribute('rel') && strtolower($link->getAttribute('rel')) == 'shortcut icon') {
return $link->getAttribute('href');
}
}
foreach ($links as $link) {
if ($link->hasAttribute('rel') && strtolower($link->getAttribute('rel')) == 'icon') {
return $link->getAttribute('href');
}
}
foreach ($links as $link) {
if ($link->hasAttribute('href') && strpos($link->getAttribute('href'), 'favicon') !== FALSE) {
return $link->getAttribute('href');
}
}
}
return false;
}
/**
* Read a cache entry if it exists, is readable and has not expired.
*
* @param string $url URL whose md5 hash names the cache file
* @param string $type cache type prefix ($TYPE_CACHE_URL or $TYPE_CACHE_IMG)
*
* @return string|bool cached content, or false when caching is disabled
*                     (falsy timeout) or no valid entry exists
*/
private function checkCache($url, $type) {
if ($this->cacheTimeout) {
$cache = $this->cacheDir . '/'. $type . md5($url);
if (file_exists($cache) && is_readable($cache)
&& ($this->cacheTimeout === -1 || time() - filemtime($cache) < $this->cacheTimeout)
) {
return $this->dataAccess->readCache($cache);
}
}
return false;
}
/**
* Will save data in cacheDir if the directory writable and any previous cache is expired (cacheTimeout)
* @param $url
* @param $data
* @param $type
* @return string cache file path
*/
private function saveCache($url, $data, $type) {
// Save cache if necessary
$cache = $this->cacheDir . '/'. $type . md5($url);
// Written when caching is enabled and no file exists yet, or when an existing writable file has expired.
if ($this->cacheTimeout && !file_exists($cache)
|| (is_writable($cache) && $this->cacheTimeout !== -1 && time() - filemtime($cache) > $this->cacheTimeout)
) {
$this->dataAccess->saveCache($cache, $data);
}
return $cache;
}
/**
* Download $url and check whether its content is an image.
*
* @param string $url URL of the resource to check
*
* @return bool see checkImageMTypeContent()
*/
private function checkImageMType($url) {
$fileContent = $this->dataAccess->retrieveUrl($url);
return $this->checkImageMTypeContent($fileContent);
}
/**
* Check whether the MIME type detected by fileinfo for $content contains "image".
*
* @param string $content raw content to inspect
*
* @return bool false for empty content, otherwise the result of the MIME check;
*              stays true if an exception is caught during detection
*/
private function checkImageMTypeContent($content) {
if(empty($content)) return false;
$isImage = true;
try {
$fInfo = finfo_open(FILEINFO_MIME_TYPE);
$isImage = strpos(finfo_buffer($fInfo, $content), 'image') !== false;
finfo_close($fInfo);
} catch (\Exception $e) {
error_log('Favicon checkImageMTypeContent error: ' . $e->getMessage());
}
return $isImage;
}
/**
* @return mixed
*/
public function getCacheDir()
{
return $this->cacheDir;
}
/**
* @param mixed $cacheDir
*/
public function setCacheDir($cacheDir)
{
$this->cacheDir = $cacheDir;
}
/**
* @return mixed
*/
public function getCacheTimeout()
{
return $this->cacheTimeout;
}
/**
* @param mixed $cacheTimeout
*/
public function setCacheTimeout($cacheTimeout)
{
$this->cacheTimeout = $cacheTimeout;
}
/**
* @return string
*/
public function getUrl()
{
return $this->url;
}
/**
* @param string $url
*/
public function setUrl($url)
{
$this->url = $url;
}
/**
* @param DataAccess|\PHPUnit_Framework_MockObject_MockObject $dataAccess
*/
public function setDataAccess($dataAccess)
{
$this->dataAccess = $dataAccess;
}
}

@ -1,23 +0,0 @@
<?php
namespace Favicon;
interface FaviconDLType
{
	/** Return the URL of the remote favicon. */
	const HOTLINK_URL = 0;

	/** Return the path of the favicon downloaded locally (requires the cache). */
	const DL_FILE_PATH = 1;

	/** Return the favicon image content as a binary string. */
	const RAW_IMAGE = 2;
}

@ -1,22 +1,104 @@
<?php
include(LIB_PATH . '/Favicon/FaviconDLType.php');
include(LIB_PATH . '/Favicon/DataAccess.php');
include(LIB_PATH . '/Favicon/Favicon.php');
// Favicons directory under DATA_PATH (presumably where downloaded favicons are stored; not used in this section — confirm against callers).
$favicons_dir = DATA_PATH . '/favicons/';
// Fallback icon that download_favicon() copies to the destination when no favicon could be retrieved.
$default_favicon = PUBLIC_PATH . '/themes/icons/default_favicon.ico';
function download_favicon($website, $dest) {
global $default_favicon;
/**
 * Tell whether a binary string looks like an image, based on its detected MIME type.
 *
 * @param string $content raw content to inspect
 * @return bool false for empty content or when the detected MIME type does not
 *              contain "image"; true when it does, and also when no check is
 *              possible (fileinfo extension missing or its database cannot be opened)
 */
function isImgMime($content) {
	//Based on https://github.com/ArthurHoaro/favicon/blob/3a4f93da9bb24915b21771eb7873a21bde26f5d1/src/Favicon/Favicon.php#L311-L319
	if ($content == '') {
		return false;
	}
	if (!extension_loaded('fileinfo')) {
		return true;
	}
	// finfo functions signal failure through their return value, not exceptions,
	// so each result is checked explicitly instead of relying on try/catch.
	$fInfo = finfo_open(FILEINFO_MIME_TYPE);
	if ($fInfo === false) {
		// Cannot inspect the content: be lenient, as when the extension is missing.
		return true;
	}
	$mime = finfo_buffer($fInfo, $content);
	finfo_close($fInfo);
	return is_string($mime) && strpos($mime, 'image') !== false;
}
syslog(LOG_INFO, 'FreshRSS Favicon discovery GET ' . $website);
$favicon_getter = new \Favicon\Favicon();
$tmpPath = realpath(TMP_PATH);
$favicon_getter->setCacheDir($tmpPath);
$favicon_getter->setCacheTimeout(-1);
$favicon_path = $favicon_getter->get($website, \Favicon\FaviconDLType::DL_FILE_PATH);
/**
 * Download a resource with cURL, following redirects.
 *
 * @param string $url URL to fetch, passed by reference: a protocol-relative URL
 *        ("//host/path") is prefixed with "https:", and after the transfer the
 *        variable is replaced by the effective URL reported by cURL when that
 *        URL is valid.
 * @param array $curlOptions additional cURL options, applied last so that they
 *        override the defaults set here
 * @return string the response body when the final HTTP status is 200, '' otherwise
 *         (invalid URL, cURL failure, or any other status)
 */
function downloadHttp(&$url, $curlOptions = array()) {
	syslog(LOG_INFO, 'FreshRSS Favicon GET ' . $url);
	if (substr($url, 0, 2) === '//') {
		// Protocol-relative URL: assume HTTPS.
		$url = 'https:' . $url;
	}
	if ($url == '' || filter_var($url, FILTER_VALIDATE_URL) === false) {
		return '';
	}
	$ch = curl_init($url);
	if ($ch === false) {
		return '';
	}
	curl_setopt_array($ch, array(
			CURLOPT_FOLLOWLOCATION => true,
			CURLOPT_MAXREDIRS => 10,
			CURLOPT_RETURNTRANSFER => true,
			CURLOPT_TIMEOUT => 15,
			CURLOPT_USERAGENT => 'FreshRSS/' . FRESHRSS_VERSION . ' (' . PHP_OS . '; ' . FRESHRSS_WEBSITE . ')',
		));
	if (defined('CURLOPT_ENCODING')) {
		curl_setopt($ch, CURLOPT_ENCODING, '');	//Enable all encodings
	}
	curl_setopt_array($ch, $curlOptions);
	$response = curl_exec($ch);
	$info = curl_getinfo($ch);
	curl_close($ch);
	// Report the URL actually reached (after redirects) back to the caller.
	if (!empty($info['url']) && (filter_var($info['url'], FILTER_VALIDATE_URL) !== false)) {
		$url = $info['url'];
	}
	if (!is_string($response) || $info['http_code'] != 200) {
		return '';
	}
	return $response;
}
/**
 * Download the page at $url and try the icons declared in its <link> elements.
 * Links with rel "shortcut icon" are tried first, then those with rel "icon";
 * the first one whose downloaded content passes isImgMime() wins.
 *
 * @param string $url page URL, passed by reference because downloadHttp() may update it
 * @return string content of the first icon accepted, '' when none was found
 */
function searchFavicon(&$url) {
	$document = new DOMDocument();
	$page = downloadHttp($url);
	if ($page == '' || !@$document->loadHTML($page, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
		return '';
	}
	$linkNodes = $document->getElementsByTagName('link');
	foreach (array('shortcut icon', 'icon') as $wantedRel) {
		foreach ($linkNodes as $node) {
			if (!$node->hasAttribute('rel') || !$node->hasAttribute('href')) {
				continue;
			}
			if (strtolower(trim($node->getAttribute('rel'))) !== $wantedRel) {
				continue;
			}
			$iconUrl = trim($node->getAttribute('href'));
			if (substr($iconUrl, 0, 2) === '//') {
				// Protocol-relative reference: assume HTTPS.
				$iconUrl = 'https:' . $iconUrl;
			}
			if (filter_var($iconUrl, FILTER_VALIDATE_URL) === false) {
				// Not a valid absolute URL: resolve it against the page URL.
				$iconUrl = SimplePie_IRI::absolutize($url, $iconUrl);
			}
			$iconData = downloadHttp($iconUrl, array(CURLOPT_REFERER => $url));
			if (isImgMime($iconData)) {
				return $iconData;
			}
		}
	}
	return '';
}
return ($favicon_path != false && @rename($tmpPath . '/' . $favicon_path, $dest)) ||
/**
 * Retrieve the favicon of a website and store it in $dest.
 * Icons declared by the page are tried first, then those declared by the site
 * root, then "favicon.ico" at the site root; when everything fails the default
 * favicon is copied instead.
 *
 * @param string $url address of the website
 * @param string $dest path of the file to write
 * @return bool true when a favicon (downloaded or default) was written to $dest
 */
function download_favicon($url, $dest) {
	global $default_favicon;
	$url = trim($url);
	$iconData = searchFavicon($url);
	if ($iconData == '') {
		$siteRoot = preg_replace('%^(https?://[^/]+).*$%i', '$1/', $url);
		if ($siteRoot != $url) {
			// The page itself declared no usable icon: look at the site root.
			$url = $siteRoot;
			$iconData = searchFavicon($url);
		}
		if ($iconData == '') {
			// Last resort: the conventional location.
			$icoUrl = $siteRoot . 'favicon.ico';
			$iconData = downloadHttp($icoUrl, array(CURLOPT_REFERER => $url));
			if (!isImgMime($iconData)) {
				$iconData = '';
			}
		}
	}
	if ($iconData != '' && file_put_contents($dest, $iconData)) {
		return true;
	}
	return @copy($default_favicon, $dest);
}

@ -1,6 +1,6 @@
<?php
require('../constants.php');
require(LIB_PATH . '/lib_rss.php'); //Includes class autoloader
require(LIB_PATH . '/favicons.php');
require(LIB_PATH . '/http-conditional.php');
@ -15,7 +15,6 @@ function show_default_favicon($cacheSeconds = 3600) {
}
}
$id = isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : '0';
if (!ctype_xdigit($id)) {
$id = '0';

Loading…
Cancel
Save