TikTokScraper.php 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. <?php
  2. namespace App\Libraries;
  3. use \DOMXPath;
  4. use \DOMDocument;
  5. use \DOMNode;
  6. use \DOMNodeList;
  7. use GuzzleHttp\Client;
  8. use \Exception;
  9. use Illuminate\Support\Facades\Log;
  /**
   * Fetches a TikTok page over HTTP, parses it into a DOM and exposes the
   * JSON payload embedded in the page's <script id="__NEXT_DATA__"> element.
   *
   * Typical use: call get($url) first, then extractUserData() or
   * extractVideoData(). Querying before a document has been loaded throws.
   */
  class TikTokScraper
  {
      /**
       * XPath of the script element that carries the page's JSON payload.
       */
      const NEXT_DATA_XPATH = '//script[@id="__NEXT_DATA__"]';

      /**
       * Parsed document of the last loaded page; null until load() succeeds.
       *
       * @var DOMDocument|null
       */
      private $dom;

      /**
       * HTTP client used to fetch pages.
       *
       * @var Client
       */
      private $client;

      public function __construct()
      {
          $this->client = new Client();
      }

      /**
       * Fetches the given URL with a GET request and loads the response body
       * into the scraper.
       *
       * @param string $url URL to fetch
       * @return $this
       * @throws Exception When the response body cannot be loaded as HTML.
       *                   Errors raised by the HTTP client propagate unchanged.
       */
      public function get($url)
      {
          $response = $this->client->request(
              'GET',
              $url,
              ['headers' => self::composeHeaders()]
          );

          return $this->load($response->getBody()->getContents());
      }

      /**
       * Returns the first node matching the XPath expression.
       *
       * @param string       $xpath  XPath to look for in the DOM
       * @param DOMNode|null $parent Optional. Context node the query is relative to
       *
       * @return DOMNode The first matching node
       * @throws Exception When no document is loaded, the expression cannot be
       *                   evaluated, or nothing matches
       */
      public function getNode($xpath, $parent = null)
      {
          $nodes = $this->getNodes($xpath, $parent);

          // DOMXPath::query() reports a malformed expression by returning false;
          // reading ->length on that would silently look like "one result".
          if ($nodes === false) {
              throw new Exception("Could not evaluate < $xpath > xpath");
          }

          if ($nodes->length === 0) {
              throw new Exception("No nodes found matching < $xpath > xpath");
          }

          return $nodes->item(0);
      }

      /**
       * Returns all nodes matching the XPath expression.
       *
       * @param string       $xpath  XPath to look for in the DOM
       * @param DOMNode|null $parent Optional. Context node the query is relative to
       *
       * @return DOMNodeList|false Matching nodes, or false when the expression
       *                           cannot be evaluated
       * @throws Exception When no document has been loaded yet
       */
      public function getNodes($xpath, $parent = null)
      {
          // Without this guard DOMXPath would be constructed with null and raise
          // a TypeError, which callers catching Exception would not see.
          if (!$this->dom instanceof DOMDocument) {
              throw new Exception('No document loaded into scraper; call get() first');
          }

          $domXpath = new DOMXPath($this->dom);

          return $domXpath->query($xpath, $parent);
      }

      /**
       * Extracts user data from the page loaded into the scraper.
       *
       * @return mixed The decoded props.pageProps.userData value
       * @throws Exception When the payload is missing or malformed, or when it
       *                   carries a status code greater than zero
       */
      public function extractUserData()
      {
          return $this->extractPageProp('userData');
      }

      /**
       * Extracts video data from the page loaded into the scraper.
       *
       * @return mixed The decoded props.pageProps.videoData value
       * @throws Exception When the payload is missing or malformed, or when it
       *                   carries a status code greater than zero
       */
      public function extractVideoData()
      {
          return $this->extractPageProp('videoData');
      }

      /**
       * Decodes the __NEXT_DATA__ payload and returns one entry of
       * props.pageProps.
       *
       * @param string $key Name of the pageProps entry to return
       *
       * @return mixed The decoded value stored under $key
       * @throws Exception When the script node is absent, its content is not
       *                   valid JSON, pageProps or $key is missing, or the
       *                   payload's statusCode is greater than zero
       */
      private function extractPageProp($key)
      {
          try {
              $script = $this->getNode(self::NEXT_DATA_XPATH);
              $data = json_decode($script->nodeValue, false);

              // json_decode() signals failure with null, which is also a valid
              // decoded value, so the error state has to be checked separately.
              if (json_last_error() !== JSON_ERROR_NONE) {
                  throw new Exception('Invalid JSON in __NEXT_DATA__ script: ' . json_last_error_msg());
              }
          } catch (Exception $e) {
              Log::error("Error getting node data. Error thrown: {$e->getMessage()}");
              throw $e;
          }

          if (!isset($data->props->pageProps) || !is_object($data->props->pageProps)) {
              throw new Exception('Unexpected __NEXT_DATA__ structure: props.pageProps is missing');
          }

          $pageProps = $data->props->pageProps;

          // A missing statusCode is treated like 0, i.e. not an error.
          $statusCode = isset($pageProps->statusCode) ? $pageProps->statusCode : 0;
          if ($statusCode > 0) {
              throw new Exception("Error getting data from TikTok, status code {$statusCode}");
          }

          // property_exists() rather than isset(): an explicit JSON null is
          // still returned to the caller as null.
          if (!property_exists($pageProps, $key)) {
              throw new Exception("Unexpected __NEXT_DATA__ structure: pageProps.{$key} is missing");
          }

          return $pageProps->{$key};
      }

      /**
       * Parses the supplied HTML into a DOMDocument and stores it on the scraper.
       *
       * The stored document is only replaced when parsing succeeds.
       *
       * @param string $html HTML to load into DOMDocument
       * @return $this
       * @throws Exception When $html is empty or cannot be parsed
       */
      private function load($html)
      {
          if ((string) $html === '') {
              Log::error('There was an error loading HTML document Empty HTML supplied');
              throw new Exception('Cannot load an empty HTML document');
          }

          $dom = new DOMDocument;

          // Collect libxml parse errors internally instead of emitting PHP
          // warnings for malformed real-world markup, then put the process-wide
          // setting back the way it was found.
          $previous = libxml_use_internal_errors(true);
          try {
              $loaded = $dom->loadHTML($html);
          } finally {
              libxml_clear_errors();
              libxml_use_internal_errors($previous);
          }

          if ($loaded === false) {
              Log::error('There was an error loading HTML document loadHTML() returned false');
              throw new Exception('Failed to parse HTML document');
          }

          $this->dom = $dom;

          return $this;
      }

      /**
       * Builds the headers sent with each page request: a User-Agent obtained
       * from \Campo\UserAgent::random() and a fixed Accept header.
       *
       * NOTE(review): the Accept value is '*' rather than the usual wildcard
       * media range; left unchanged here, confirm this is intended.
       *
       * @return array Header name => header value
       * @throws Exception Presumably propagated from UserAgent::random(); verify
       */
      private static function composeHeaders()
      {
          return [
              'User-Agent' => \Campo\UserAgent::random(),
              'Accept' => '*',
          ];
      }
  }