From tj, 11 Months ago, written in PHP.
Embed
  1. <?php
  2. /**
  3.  * Simple Machines Forum (SMF)
  4.  *
  5.  * @package SMF
  6.  * @author Simple Machines http://www.simplemachines.org
  7.  * @copyright 2017 Simple Machines and individual contributors
  8.  * @license http://www.simplemachines.org/about/smf/license.php BSD
  9.  *
  10.  * @version 2.0.15
  11.  */
  12.  
  // Refuse to run when this file is requested directly: the SMF constant is only
  // defined once the forum itself has been bootstrapped.
  if (!defined('SMF')) {
      die('Hacking attempt...');
  }
  15.  
  16. /**
  17.  * Class curl_fetch_web_data
  18.  * Simple cURL class to fetch a web page
  19.  * Properly redirects even with safe mode and basedir restrictions
  20.  * Can provide simple post options to a page
  21.  *
  22.  * Load class
  23.  * Initiate as
  24.  *  - $fetch_data = new cURL_fetch_web_data();
  25.  *      - optionally pass an array of cURL options and redirect count
  26.  *      - cURL_fetch_web_data(cURL options array, Max redirects);
  27.  *  - $fetch_data = new cURL_fetch_web_data(array(CURLOPT_SSL_VERIFYPEER => 1), 5);
  28.  *
  29.  * Make the call
  30.  *  - $fetch_data->get_url_data('http://www.simplemachines.org'); // fetch a page
  31.  *  - $fetch_data->get_url_data('http://www.simplemachines.org', array('user' => 'name', 'password' => 'password')); // post to a page
  32.  *  - $fetch_data->get_url_data('http://www.simplemachines.org', 'parameter1&parameter2&parameter3'); // post to a page
  33.  *
  34.  * Get the data
  35.  *  - $fetch_data->result('body'); // just the page content
  36.  *  - $fetch_data->result(); // an array of results, body, header, http result codes
  37.  *  - $fetch_data->result_raw(); // show all results of all calls (in the event of a redirect)
  38.  *  - $fetch_data->result_raw(0); // show all results of call x
  39.  */
  class curl_fetch_web_data
  {
      /**
       * Default cURL options applied to every request. User supplied options with
       * the same key take precedence (see set_options()).
       *
       * NOTE(review): peer and host verification are switched off here, so HTTPS
       * responses are not authenticated. Callers that fetch anything sensitive should
       * pass array(CURLOPT_SSL_VERIFYPEER => 1, CURLOPT_SSL_VERIFYHOST => 2) to the
       * constructor. The defaults are left untouched to stay backward compatible.
       *
       * @var array
       */
      private $default_options = array(
          CURLOPT_RETURNTRANSFER => 1, // Get returned value as a string (don't output it)
          CURLOPT_HEADER         => 1, // We need the headers to do our own redirect
          CURLOPT_FOLLOWLOCATION => 0, // Don't follow, we do it ourselves so safe mode and open_basedir are not an issue
          CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:11.0) Gecko Firefox/11.0', // set a normal looking useragent
          CURLOPT_CONNECTTIMEOUT => 15, // Don't wait forever on a connection
          CURLOPT_TIMEOUT        => 90, // A page should load in this amount of time
          CURLOPT_MAXREDIRS      => 5, // stop after this many redirects
          CURLOPT_ENCODING       => 'gzip,deflate', // accept gzip and decode it
          CURLOPT_SSL_VERIFYPEER => 0, // stop cURL from verifying the peer's certificate
          CURLOPT_SSL_VERIFYHOST => 0, // stop cURL from verifying the peer's host
          CURLOPT_POST           => 0, // no post data unless its passed
      );

      // The properties below used to be created on the fly. They are declared public so
      // their visibility is the same as the dynamic properties they replace.

      /**
       * @var int Maximum number of redirects that will be followed
       */
      public $max_redirect;

      /**
       * @var array cURL options supplied by the caller
       */
      public $user_options = array();

      /**
       * @var string|null URL encoded POST body for the current call, null for a plain fetch
       */
      public $post_data;

      /**
       * @var array The final cURL options used for the current request
       */
      public $options = array();

      /**
       * @var int Counter of the redirect hop we are on (starts at 1)
       */
      public $current_redirect = 1;

      /**
       * @var array One entry (url, code, error, headers, body, size) per request made
       */
      public $response = array();

      /**
       * @var array Lowercased header name => value for the request in progress
       */
      public $headers = array();

      /**
       * Start the curl object
       * - allow for user override values
       *
       * @param array $options An array of cURL options
       * @param int $max_redirect Maximum number of redirects
       * @return void
       */
      public function __construct($options = array(), $max_redirect = 3)
      {
          $this->max_redirect = intval($max_redirect);
          $this->user_options = $options;
      }

      /**
       * Main calling function,
       *  - will request the page data from a given $url
       *  - optionally will post data to the page form if post data is supplied
       *  - passed arrays will be converted to a post string joined with &'s
       *  - calls set_options to set the curl opts array values based on the defaults and user input
       *
       * @param string $url the site we are going to fetch
       * @param array|string $post_data any post data as form name => value, or a ready made post string
       * @return object An instance of the curl_fetch_web_data class
       */
      public function get_url_data($url, $post_data = array())
      {
          // Forget the body of any earlier call, otherwise a plain fetch made with the
          // same object after a POST would silently be sent as a POST again.
          $this->post_data = null;

          // POSTing some data perhaps?
          if (!empty($post_data) && is_array($post_data))
              $this->post_data = $this->build_post_data($post_data);
          elseif (!empty($post_data))
              $this->post_data = trim($post_data);

          // set the options and get it
          $this->set_options();
          $this->curl_request(str_replace(' ', '%20', $url));

          return $this;
      }

      /**
       * Makes the actual cURL call
       *  - stores responses (url, code, error, headers, body) in the response array
       *  - detects 301, 302, 307 codes and will redirect to the given response header location
       *
       * @param string $url The site to fetch
       * @param bool $redirect Whether or not this was a redirect request
       * @return void|bool Sets various properties of the class or returns false if the URL isn't specified
       */
      private function curl_request($url, $redirect = false)
      {
          // A fresh (non redirect) request starts a new chain of responses. This is done
          // before the URL check so a failed call never exposes the previous call's results.
          if (!$redirect)
          {
              $this->current_redirect = 1;
              $this->response = array();
          }

          // we do have a url I hope
          if ($url == '')
              return false;

          $this->options[CURLOPT_URL] = $url;

          // Headers belong to a single request; without this a Location header from an
          // earlier hop could be mistaken for one sent with this response.
          $this->headers = array();

          // Initialize the curl object and make the call
          $cr = curl_init();
          curl_setopt_array($cr, $this->options);
          curl_exec($cr);

          // Get what was returned
          $curl_info = curl_getinfo($cr);
          $curl_content = curl_multi_getcontent($cr);
          $curl_error = curl_error($cr);
          $url = $curl_info['url']; // Last effective URL
          $http_code = $curl_info['http_code']; // Last HTTP code
          $body = $curl_error === '' ? substr((string) $curl_content, $curl_info['header_size']) : false;
          $error = $curl_error !== '' ? $curl_error : false;

          // close this request
          curl_close($cr);

          // store this 'loops' data, someone may want all of these :O
          $this->response[] = array(
              'url' => $url,
              'code' => $http_code,
              'error' => $error,
              'headers' => !empty($this->headers) ? $this->headers : false,
              'body' => $body,
              'size' => $curl_info['download_content_length'],
          );

          // If this a redirect with a location header and we have not given up, then do it again
          $location = isset($this->headers['location']) ? $this->headers['location'] : '';
          if (in_array((int) $http_code, array(301, 302, 307), true) && $location != '' && $this->current_redirect <= $this->max_redirect)
          {
              $this->current_redirect++;
              $this->redirect($this->get_redirect_url($url, $location), $url);
          }
      }

      /**
       * Used if being redirected to ensure we have a fully qualified address
       *  - a Location naming its own host is used as is (including any port)
       *  - a Location without a host inherits scheme, host and port from the last URL
       *  - a path that does not start with / is resolved against the directory of the last URL
       *  - user info and fragments are never carried over
       *
       * @param string $last_url The URL we went to
       * @param string $new_url The URL we were redirected to
       * @return string The new URL that was in the HTTP header
       */
      private function get_redirect_url($last_url = '', $new_url = '')
      {
          // parse_url() returns false for seriously malformed input, treat that as "no parts"
          $last = parse_url($last_url);
          $new = parse_url($new_url);
          if (!is_array($last))
              $last = array();
          if (!is_array($new))
              $new = array();

          $scheme = isset($new['scheme']) ? $new['scheme'] : (isset($last['scheme']) ? $last['scheme'] : 'http');

          // Host and port travel together: a port only makes sense with the host it came with
          $authority_source = isset($new['host']) ? $new : $last;
          $authority = isset($authority_source['host']) ? $authority_source['host'] : '';
          if (isset($authority_source['port']))
              $authority .= ':' . $authority_source['port'];

          $last_path = !empty($last['path']) ? $last['path'] : '/';
          $new_path = isset($new['path']) ? $new['path'] : '';

          if (isset($new['host']))
              // Absolute (or protocol relative) redirect, its path stands on its own
              $path = $new_path;
          elseif ($new_path === '')
              // Nothing but a query string was sent, stay on the same page
              $path = $last_path;
          elseif ($new_path[0] === '/')
              $path = $new_path;
          else
          {
              // Relative path, resolve it against the directory of the page we came from
              $slash = strrpos($last_path, '/');
              $path = ($slash === false ? '/' : substr($last_path, 0, $slash + 1)) . $new_path;
          }

          $query = !empty($new['query']) ? '?' . $new['query'] : '';

          // Build the new URL that was in the http header
          return $scheme . '://' . $authority . $path . $query;
      }

      /**
       * Used to return the results to the calling program
       *  - called as ->result() will return the full final array
       *  - called as ->result('body') to just return the page source of the result
       *  - an unknown area returns the full final array
       *
       * @param string $area Used to return an area such as body, header, error
       * @return array|string|bool The response (or the requested part of it), false if no request has been made
       */
      public function result($area = '')
      {
          // Nothing fetched (yet), e.g. an empty URL was supplied
          if (empty($this->response))
              return false;

          $final = $this->response[count($this->response) - 1];

          // just return a specified area or the entire result?
          if ($area != '' && isset($final[$area]))
              return $final[$area];

          return $final;
      }

      /**
       * Will return all results from all loops (redirects)
       *  - Can be called as ->result_raw(x) where x is a specific loop results.
       *  - Call as ->result_raw() for everything.
       *  - x is clamped to the range of available responses
       *
       * @param string $response_number Which response we want to get
       * @return array|bool The entire response array or just the specified response, false if that one was asked for but none exist
       */
      public function result_raw($response_number = '')
      {
          if (!is_numeric($response_number))
              return $this->response;

          if (empty($this->response))
              return false;

          $index = max(0, min((int) $response_number, count($this->response) - 1));

          return $this->response[$index];
      }

      /**
       * Takes supplied POST data and url encodes it
       *  - forms the data (for post) in to a string var=xyz&var2=abc&var3=123
       *  - drops values with a leading @ since we don't support sending files (uploading)
       *  - values that are not scalar (arrays, objects) are sent as empty strings
       *
       * @param array|string $post_data The raw POST data
       * @return string A string of post data
       */
      private function build_post_data($post_data)
      {
          if (!is_array($post_data))
              return $post_data;

          $postvars = array();

          foreach ($post_data as $name => $value)
          {
              // Work on a string so looking at the first character is safe for empty and numeric values
              $value = is_scalar($value) ? (string) $value : '';

              // A leading @ can be used to send files, we don't support that.
              if ($value !== '' && $value[0] === '@')
                  $value = '';

              $postvars[] = $name . '=' . urlencode($value);
          }

          return implode('&', $postvars);
      }

      /**
       * Sets the final cURL options for the current call
       *  - overwrites our default values with user supplied ones or appends new user ones to what we have
       *  - sets the callback function now that $this is existing
       *
       * @return void
       */
      private function set_options()
      {
          // Callback to parse the returned headers, if any
          $defaults = $this->default_options;
          $defaults[CURLOPT_HEADERFUNCTION] = array($this, 'header_callback');

          // Any user options to account for. array_replace() keeps the integer CURLOPT_* keys
          // (array_merge() would renumber them) and lets the user values win.
          if (is_array($this->user_options))
              $this->options = array_replace($defaults, $this->user_options);
          else
              $this->options = $defaults;

          // POST data options, here we don't allow any override
          if (isset($this->post_data))
          {
              $this->options[CURLOPT_POST] = 1;
              $this->options[CURLOPT_POSTFIELDS] = $this->post_data;
          }
      }

      /**
       * Called to initiate a redirect from a 301, 302 or 307 header
       *  - resets the cURL options for the loop, sets the referrer flag
       *
       * @param string $target_url The URL we want to redirect to
       * @param string $referer_url The URL that we're redirecting from
       * @return void
       */
      private function redirect($target_url, $referer_url)
      {
          // no no I last saw that over there ... really, 301, 302, 307
          $this->set_options();
          $this->options[CURLOPT_REFERER] = $referer_url;
          $this->curl_request($target_url, true);
      }

      /**
       * Callback function to parse returned headers
       *  - lowercases the header names to make them consistent
       *  - lines without a colon (such as the HTTP status line) are ignored
       *
       * @param resource $cr The cURL handle, passed in by cURL and not used here
       * @param string $header A single raw header line
       * @return int The length of the header
       */
      private function header_callback($cr, $header)
      {
          // The space after the colon is optional, so split on the colon alone
          $parts = explode(':', trim($header), 2);

          // set proper headers only
          if (count($parts) === 2 && trim($parts[0]) !== '')
              $this->headers[strtolower(trim($parts[0]))] = trim($parts[1]);

          // return the length of what was passed unless you want a Failed writing header error ;)
          return strlen($header);
      }
  }
  312.  
  313. ?>
  314.