| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152 |
- <?php
- namespace App\Libraries;
- use \DOMXPath;
- use \DOMDocument;
- use \DOMNode;
- use \DOMNodeList;
- use GuzzleHttp\Client;
- use \Exception;
- use Illuminate\Support\Facades\Log;
/**
 * Fetches a page over HTTP, parses it into a DOM, and extracts the JSON
 * payload embedded in the `<script id="__NEXT_DATA__">` element.
 *
 * Typical use: `(new TikTokScraper())->get($url)->extractUserData()`.
 */
class TikTokScraper
{
    /**
     * XPath locating the script element that carries the embedded JSON payload.
     */
    private const NEXT_DATA_XPATH = '//script[@id="__NEXT_DATA__"]';

    /**
     * DOM of the most recently loaded page; null until load() has run.
     *
     * @var DOMDocument|null
     */
    private $dom;

    /**
     * HTTP client used by get() to fetch pages.
     *
     * @var Client
     */
    private $client;

    public function __construct()
    {
        $this->client = new Client();
    }

    /**
     * Loads specified URL, and loads the returned HTML into the scraper.
     *
     * @param string $url URL to fetch
     * @return $this
     * @throws Exception
     */
    public function get($url)
    {
        $response = $this->client->request(
            'GET',
            $url,
            ['headers' => self::composeHeaders()]
        );

        return $this->load($response->getBody()->getContents());
    }

    /**
     * Returns the first node matching the given XPath.
     *
     * @param string       $xpath  XPath to look for in the DOM
     * @param DOMNode|null $parent Optional. Element to use as context node when querying the DOM
     *
     * @return DOMNode First matching node
     * @throws Exception When no document is loaded, the query fails, or nothing matches
     */
    public function getNode($xpath, $parent = null)
    {
        $nodes = $this->getNodes($xpath, $parent);

        // DOMXPath::query() yields false (not an empty list) for a malformed
        // expression or an invalid context node; reading ->length on it would fail.
        if ($nodes === false) {
            throw new Exception("Unable to evaluate < $xpath > xpath");
        }

        if ($nodes->length === 0) {
            throw new Exception("No nodes found matching < $xpath > xpath");
        }

        return $nodes->item(0);
    }

    /**
     * Returns all nodes matching specified XPath.
     *
     * @param string       $xpath  XPath to look for in the DOM
     * @param DOMNode|null $parent Optional. Element to use as context node when querying the DOM
     *
     * @return DOMNodeList|false List of matching nodes, or false when the query cannot be evaluated
     * @throws Exception When no document has been loaded yet
     */
    public function getNodes($xpath, $parent = null)
    {
        // Without this guard DOMXPath's constructor would receive null and raise
        // a TypeError, which callers catching Exception would not see.
        if (!$this->dom instanceof DOMDocument) {
            throw new Exception('No HTML document has been loaded into the scraper');
        }

        $domXpath = new DOMXPath($this->dom);

        return $domXpath->query($xpath, $parent);
    }

    /**
     * Extracts user data from page loaded into scraper.
     *
     * @return mixed Value of `props.pageProps.userData` from the embedded JSON
     * @throws Exception
     */
    public function extractUserData()
    {
        return $this->extractPageProp('userData');
    }

    /**
     * Extracts video data from page loaded into scraper.
     *
     * @return mixed Value of `props.pageProps.videoData` from the embedded JSON
     * @throws Exception
     */
    public function extractVideoData()
    {
        return $this->extractPageProp('videoData');
    }

    /**
     * Decodes the embedded JSON payload and returns one entry of `props.pageProps`.
     *
     * @param string $key Property of `props.pageProps` to return
     *
     * @return mixed Value stored under the requested key (may be null if the payload says so)
     * @throws Exception When the script node is missing, the payload is not usable JSON,
     *                   the payload reports a positive status code, or the key is absent
     */
    private function extractPageProp($key)
    {
        try {
            $script = $this->getNode(self::NEXT_DATA_XPATH);
            $data = json_decode($script->nodeValue, false);
            $pageProps = $data->props->pageProps ?? null;

            // json_decode() returns null on malformed input, and a well-formed
            // payload may still lack the expected structure; fail loudly here
            // instead of dereferencing a non-object further down.
            if (!is_object($pageProps)) {
                $reason = json_last_error() === JSON_ERROR_NONE
                    ? 'props.pageProps is missing'
                    : json_last_error_msg();

                throw new Exception("Unable to read page data from TikTok: {$reason}");
            }
        } catch (Exception $e) {
            Log::error("Error getting node data. Error thrown: {$e->getMessage()}");
            throw $e;
        }

        $statusCode = $pageProps->statusCode ?? 0;
        if ($statusCode > 0) {
            throw new Exception("Error getting data from TikTok, status code {$statusCode}");
        }

        // property_exists() rather than isset(): a key that is present with a
        // null value is still handed back to the caller unchanged.
        if (!property_exists($pageProps, $key)) {
            throw new Exception("TikTok page data does not contain {$key}");
        }

        return $pageProps->$key;
    }

    /**
     * Loads supplied HTML into a fresh DOMDocument.
     *
     * Parsing is best-effort: failures are logged and leave an empty document
     * in place, so a later getNode() call reports that nothing matched.
     *
     * @param string $html HTML to load into DOMDocument
     * @return $this
     */
    private function load($html)
    {
        $this->dom = new DOMDocument();

        // DOMDocument::loadHTML() rejects an empty string outright (ValueError on
        // PHP 8, warning on PHP 7), so do not hand it one.
        if ((string) $html === '') {
            Log::error('There was an error loading HTML document empty document received');

            return $this;
        }

        // Collect libxml parse errors internally instead of emitting warnings,
        // and remember the previous setting: it is process-wide state.
        $previous = libxml_use_internal_errors(true);

        try {
            if ($this->dom->loadHTML($html) === false) {
                Log::error('There was an error loading HTML document document could not be parsed');
            }
        } catch (Exception $e) {
            Log::error("There was an error loading HTML document {$e->getMessage()}");
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $this;
    }

    /**
     * Builds the request headers, using a random user-agent string.
     *
     * @return array Header name => value map passed to the HTTP client
     * @throws Exception
     */
    private static function composeHeaders()
    {
        return [
            'User-Agent' => \Campo\UserAgent::random(),
            // NOTE(review): a bare "*" is not a valid media range; "*/*" is the
            // conventional wildcard. Left as-is pending confirmation that the
            // remote service behaves the same with "*/*".
            'Accept' => '*',
        ];
    }
}
|