178 lines
5.1 KiB
PHP
178 lines
5.1 KiB
PHP
|
<?php
|
||
|
/**
 * Copyright (c) 2014 Lukas Reschke <lukas@owncloud.com>
 * This file is licensed under the Affero General Public License version 3 or
 * later.
 * See the COPYING-README file.
 */
|
||
|
|
||
|
namespace OC;
|
||
|
|
||
|
class HTTPHelper {
	const USER_AGENT = 'ownCloud Server Crawler';

	/** Maximum number of redirects that are followed before giving up. */
	const MAX_REDIRECTS = 10;

	/** @var \OC\AllConfig */
	private $config;

	/**
	 * @param \OC\AllConfig $config used to read the 'proxy' and 'proxyuserpwd' system values
	 */
	public function __construct(AllConfig $config) {
		$this->config = $config;
	}

	/**
	 * Returns the default stream context options used for non-curl requests.
	 *
	 * @return array options suitable for stream_context_create()
	 */
	public function getDefaultContextArray() {
		return array(
			'http' => array(
				'header' => 'User-Agent: ' . self::USER_AGENT . "\r\n",
				'timeout' => 10,
				'follow_location' => false, // Do not follow the location since we can't limit the protocol
			),
			'ssl' => array(
				'disable_compression' => true
			)
		);
	}

	/**
	 * Get URL content
	 *
	 * Fetches the page via curl if curl is available, otherwise via
	 * file_get_contents(). Redirects are followed up to MAX_REDIRECTS times
	 * and only to http:// or https:// targets.
	 *
	 * @param string $url Url to get content
	 * @throws \Exception If the URL does not start with http:// or https://
	 * @return string|false the response body or false on error
	 */
	public function getUrlContent($url) {
		if (!$this->isHTTPURL($url)) {
			throw new \Exception('$url must start with https:// or http://', 1);
		}

		$proxy = $this->config->getSystemValue('proxy', null);
		$proxyUserPwd = $this->config->getSystemValue('proxyuserpwd', null);

		if (function_exists('curl_init')) {
			return $this->getUrlContentViaCurl($url, $proxy, $proxyUserPwd);
		}

		// Fallback without curl. Note that 'proxyuserpwd' is not applied here.
		$url = $this->getFinalLocationOfURL($url);
		$contextArray = $this->getDefaultContextArray();
		if ($proxy !== null) {
			$contextArray['http']['proxy'] = $proxy;
		}
		$ctx = stream_context_create($contextArray);

		// Errors are deliberately suppressed: callers get false on failure.
		return @file_get_contents($url, false, $ctx);
	}

	/**
	 * Fetches a URL using curl.
	 *
	 * @param string $url a HTTP or HTTPS URL
	 * @param string|null $proxy proxy to use, null for none
	 * @param string|null $proxyUserPwd proxy credentials, null for none
	 * @return string|false the response body or false on error
	 */
	private function getUrlContentViaCurl($url, $proxy, $proxyUserPwd) {
		$curl = curl_init();

		curl_setopt($curl, CURLOPT_HEADER, 0);
		curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
		curl_setopt($curl, CURLOPT_URL, $url);
		curl_setopt($curl, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		curl_setopt($curl, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
		curl_setopt($curl, CURLOPT_USERAGENT, self::USER_AGENT);

		if ($proxy !== null) {
			curl_setopt($curl, CURLOPT_PROXY, $proxy);
		}
		if ($proxyUserPwd !== null) {
			curl_setopt($curl, CURLOPT_PROXYUSERPWD, $proxyUserPwd);
		}

		if ($this->canCurlFollowRedirects()) {
			curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
			curl_setopt($curl, CURLOPT_MAXREDIRS, self::MAX_REDIRECTS);
			$data = curl_exec($curl);
		} else {
			// curl is not allowed to follow redirects itself, so resolve the
			// final URL manually before fetching the body.
			curl_setopt($curl, CURLOPT_FOLLOWLOCATION, false);
			$finalURL = $this->resolveRedirectsViaCurl($curl, $url);
			if ($finalURL === false) {
				$data = false;
			} else {
				curl_setopt($curl, CURLOPT_URL, $finalURL);
				$data = curl_exec($curl);
			}
		}

		curl_close($curl);

		return $data;
	}

	/**
	 * Checks whether CURLOPT_FOLLOWLOCATION may be enabled, i.e. open_basedir
	 * is unset AND safe_mode is either unknown to this PHP version or disabled.
	 *
	 * @return bool
	 */
	private function canCurlFollowRedirects() {
		if ((string)ini_get('open_basedir') !== '') {
			return false;
		}

		// ini_get() returns false when the directive does not exist.
		$safeMode = ini_get('safe_mode');
		if ($safeMode === false) {
			return true;
		}

		return in_array(strtolower((string)$safeMode), array('', '0', 'off'), true);
	}

	/**
	 * Follows redirects manually by issuing header-only requests on a copy of
	 * the given curl handle.
	 *
	 * @param resource $curl configured curl handle, it is not modified
	 * @param string $url URL to start from
	 * @return string|false the last URL reached, or false if the redirect limit
	 * was exceeded or a redirect pointed to a non HTTP(S) location
	 */
	private function resolveRedirectsViaCurl($curl, $url) {
		$redirectCodes = array(301, 302, 303, 307, 308);

		$probe = curl_copy_handle($curl);
		curl_setopt($probe, CURLOPT_HEADER, true);
		curl_setopt($probe, CURLOPT_NOBODY, true);
		curl_setopt($probe, CURLOPT_FORBID_REUSE, false);
		curl_setopt($probe, CURLOPT_RETURNTRANSFER, true);

		$remaining = self::MAX_REDIRECTS;
		while ($remaining > 0) {
			curl_setopt($probe, CURLOPT_URL, $url);
			$header = curl_exec($probe);

			// On a failed probe stop here and let the real request report the error.
			if ($header === false || curl_errno($probe)) {
				break;
			}

			$code = (int)curl_getinfo($probe, CURLINFO_HTTP_CODE);
			if (!in_array($code, $redirectCodes, true)) {
				break;
			}

			$next = $this->parseLocationHeader($header);
			if ($next === null) {
				// Redirect status without a target: nothing more to follow.
				break;
			}
			if (!$this->isHTTPURL($next)) {
				curl_close($probe);
				return false;
			}

			$url = $next;
			$remaining--;
		}
		curl_close($probe);

		return $remaining > 0 ? $url : false;
	}

	/**
	 * Extracts the value of the first Location header from a raw header block.
	 *
	 * @param string $header raw response headers
	 * @return string|null the header value or null if there is none
	 */
	private function parseLocationHeader($header) {
		// Header names are case insensitive, hence the "i" modifier.
		if (preg_match('/^Location:[ \t]*(.*?)[ \t\r]*$/mi', $header, $matches) !== 1) {
			return null;
		}

		$location = trim($matches[1]);

		return $location === '' ? null : $location;
	}

	/**
	 * Returns the response headers of a HTTP URL without following redirects
	 *
	 * Note: this replaces the process-wide default stream context.
	 *
	 * @param string $location Needs to be a HTTPS or HTTP URL
	 * @return array|false headers as returned by get_headers(), false on failure
	 */
	public function getHeaders($location) {
		stream_context_set_default($this->getDefaultContextArray());
		return get_headers($location, 1);
	}

	/**
	 * Checks whether the supplied URL begins with HTTPS:// or HTTP:// (case insensitive)
	 *
	 * @param string $url
	 * @return bool false for anything that is not a string starting with one of the schemes
	 */
	public function isHTTPURL($url) {
		if (!is_string($url)) {
			return false;
		}

		return stripos($url, 'https://') === 0 || stripos($url, 'http://') === 0;
	}

	/**
	 * Returns the last HTTP or HTTPS site the request has been redirected to using the Location HTTP header
	 *
	 * This is a very ugly workaround for the missing functionality to restrict fopen() to protocols.
	 * At most MAX_REDIRECTS redirects are followed; a redirect to a non HTTP(S)
	 * location is not followed.
	 *
	 * @param string $location Needs to be a HTTPS or HTTP URL
	 * @throws \Exception In case the initial URL is not a HTTP or HTTPS one
	 * @return string
	 */
	public function getFinalLocationOfURL($location) {
		if (!$this->isHTTPURL($location)) {
			throw new \Exception('URL must begin with HTTPS or HTTP.');
		}

		// Bounded loop so that a redirect cycle cannot hang the request.
		for ($hops = 0; $hops < self::MAX_REDIRECTS; $hops++) {
			$next = $this->findLocationHeader($this->getHeaders($location));
			if ($next === null || !$this->isHTTPURL($next)) {
				break;
			}
			$location = $next;
		}

		return $location;
	}

	/**
	 * Looks up the Location header in the result of getHeaders().
	 *
	 * @param array|false $headers
	 * @return string|null the redirect target or null if there is none
	 */
	private function findLocationHeader($headers) {
		if (!is_array($headers)) {
			return null;
		}

		foreach ($headers as $name => $value) {
			if (!is_string($name) || strcasecmp($name, 'Location') !== 0) {
				continue;
			}
			// A header sent more than once is reported as an array; use the last one.
			if (is_array($value)) {
				$value = end($value);
			}
			return is_string($value) ? $value : null;
		}

		return null;
	}
}
|