complete rewrite of comment scraper, caching features still missing

This commit is contained in:
2019-10-16 18:52:43 +02:00
parent e678442198
commit 6ae152a3d1
6 changed files with 161 additions and 1152 deletions

View File

@@ -1,183 +1,159 @@
<?php
/**
* @copyright Copyright (c) 2018 Bjoern Schiessle <bjoern@schiessle.org>
*
* @license GNU AGPL version 3 or any later version
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
require_once 'Mastodon_api.php';
require_once '../../mastodon.feed/config.php';
require_once 'config.php';
/* Base URL of the Mastodon instance, taken from the 'mastodon-instance' config key. */
$instance = $config['mastodon-instance'];
/* Account id on that instance, taken from the 'user-id' config key. */
$uid = $config['user-id'];
/* Value of the 'search-url' config key; used as the prefix of the link pattern matched against toot content. */
$searchurl = $config['search-url'];
/* Search term from the 'search' query parameter; empty string when the parameter is absent. */
$search = isset($_GET['search']) ? $_GET['search'] : '';
/* cache files */
/* JSON file holding the list of known toots (id and matched url per entry). */
$dbt = "cache-toots.json";
/**
 * Collects comments and statistics for toots from a Mastodon instance and
 * keeps the collected results in a JSON cache file.
 */
class CollectMastodonData {
/** @var \Mastodon_api client used for search requests */
private $api;
/** @var string url of the mastodon instance (default; replaced by the 'mastodon-instance' config value in the constructor) */
private $mastodonUrl = 'https://mastodon.social';
/** @var string token to authenticate at the mastodon instance */
private $bearerToken;
/** @var int cached results are reused while they are younger than this many seconds (600 = 10 minutes) */
private $threshold = 600;
/** @var string uid on the mastodon instance */
private $uid;
/** @var array cached comments from previous searches, keyed by the id passed to storeCollection() */
private $commentCache = [];
/** @var string path of the JSON file the comment cache is written to and read from */
private $cacheFile = 'myCommentsCache.json';
/**
 * Initialise the collector from the configuration array and prepare the
 * API client with the instance URL and the bearer token.
 *
 * @param array $config must provide 'mastodon-instance', 'token' and 'user-id'
 */
public function __construct($config) {
    $this->mastodonUrl = $config['mastodon-instance'];
    $this->bearerToken = $config['token'];
    $this->uid = $config['user-id'];

    // Configure the client completely before exposing it on the instance.
    $client = new Mastodon_api();
    $client->set_url($this->mastodonUrl);
    $client->set_token($this->bearerToken, 'bearer');
    $this->api = $client;
}
/**
 * Reduce raw descendant statuses to the fields needed for rendering and
 * add them to $result['comments'], keyed by status id.
 *
 * @param array $descendants raw statuses as decoded from the context endpoint
 * @param mixed $root id of the toot the descendants belong to
 * @param array $result collection that is extended in place
 * @return array the extended collection
 */
private function filterComments($descendants, $root, &$result) {
    foreach ($descendants as $status) {
        // Fall back to the account's username when the display name is empty.
        $name = $status['account']['display_name'];
        if (!$name) {
            $name = $status['account']['username'];
        }

        $author = [
            'display_name' => $name,
            'avatar' => $status['account']['avatar_static'],
            'url' => $status['account']['url']
        ];

        $result['comments'][$status['id']] = [
            'author' => $author,
            'toot' => $status['content'],
            'date' => $status['created_at'],
            'url' => $status['uri'],
            'reply_to' => $status['in_reply_to_id'],
            'root' => $root,
        ];
    }

    return $result;
}
/**
 * Extract the counters and the URL from a raw status.
 *
 * @param array $stats raw status as decoded from the API
 * @return array with the keys 'reblogs', 'favs', 'replies' (integers) and 'url'
 */
private function filterStats($stats) {
    return [
        'reblogs' => intval($stats['reblogs_count']),
        'favs' => intval($stats['favourites_count']),
        'replies' => intval($stats['replies_count']),
        'url' => $stats['url']
    ];
}
/**
 * Pick the ids of all statuses in a search response that are not replies.
 *
 * @param array $searchResult search response; statuses are read from ['html']['statuses']
 * @return array sorted list of status ids, empty when the response has no statuses
 */
private function filterSearchResults($searchResult) {
    if (!isset($searchResult['html']['statuses'])) {
        return [];
    }

    $rootIds = [];
    foreach ($searchResult['html']['statuses'] as $toot) {
        // Replies carry the id of their parent; only top-level toots are kept.
        if ($toot['in_reply_to_id'] !== null) {
            continue;
        }
        $rootIds[] = $toot['id'];
    }

    sort($rootIds);
    return $rootIds;
}
/**
 * Search the instance for toots matching a blog post and return the ids
 * of the matching top-level toots.
 *
 * @param string $search search term passed to the API as 'q'
 * @return array
 */
public function findToots($search) {
    $query = ['q' => $search];
    return $this->filterSearchResults($this->api->search($query));
}
/**
 * Fetch the replies to a toot and add them to $result['comments'].
 *
 * The request goes to the configured instance ($this->mastodonUrl), because
 * the status ids passed in come from that instance. $result is left
 * untouched when the context cannot be fetched or decoded.
 *
 * @param string $id id of the toot whose descendants are collected
 * @param array $result collection that is extended in place
 */
public function getComments($id, &$result) {
    $url = rtrim($this->mastodonUrl, '/') . '/api/v1/statuses/' . rawurlencode((string)$id) . '/context';

    $raw = file_get_contents($url);
    if ($raw === false) {
        return;
    }

    $json = json_decode($raw, true);
    // An error payload or malformed body has no usable list of descendants.
    if (!is_array($json) || !isset($json['descendants']) || !is_array($json['descendants'])) {
        return;
    }

    $this->filterComments($json['descendants'], $id, $result);
}
/**
 * Fetch a toot and add its counters to the running totals in $result['stats'].
 *
 * The request goes to the configured instance ($this->mastodonUrl), because
 * the status ids passed in come from that instance. The 'url' entry is only
 * set when it is still empty, so the first toot processed provides it.
 * $result is left untouched when the status cannot be fetched or decoded.
 *
 * @param string $id id of the toot
 * @param array $result collection whose 'stats' entry is updated in place
 */
public function getStatistics($id, &$result) {
    $url = rtrim($this->mastodonUrl, '/') . '/api/v1/statuses/' . rawurlencode((string)$id);

    $raw = file_get_contents($url);
    if ($raw === false) {
        return;
    }

    $json = json_decode($raw, true);
    if (!is_array($json)) {
        return;
    }

    $newStats = $this->filterStats($json);
    $result['stats']['reblogs'] += $newStats['reblogs'];
    $result['stats']['favs'] += $newStats['favs'];
    $result['stats']['replies'] += $newStats['replies'];
    if (empty($result['stats']['url'])) {
        $result['stats']['url'] = $newStats['url'];
    }
}
/**
 * Stamp a collection with the current time, put it into the in-memory
 * cache under $id and write the whole cache to the cache file as JSON.
 *
 * @param string $id cache key
 * @param array $comments collection to store
 */
public function storeCollection($id, $comments) {
    $comments['timestamp'] = time();
    $this->commentCache[$id] = $comments;

    $serialized = json_encode($this->commentCache);
    file_put_contents($this->cacheFile, $serialized);
}
/**
 * Look up a previously stored collection for a search term.
 *
 * Loads the cache file into the in-memory cache when it holds a JSON array.
 * An entry is returned (without its 'timestamp') only while its timestamp
 * plus the threshold is still in the future.
 *
 * @param string $search cache key
 * @return array the cached collection, or an empty array when nothing usable is cached
 */
public function getCachedCollection($search) {
    if (!file_exists($this->cacheFile)) {
        return [];
    }

    $decoded = json_decode(file_get_contents($this->cacheFile), true);
    if (!is_array($decoded)) {
        return [];
    }

    $this->commentCache = $decoded;
    $now = time();

    if (!isset($decoded[$search])) {
        return [];
    }

    $entry = $decoded[$search];
    $expiresAt = (int)$entry['timestamp'] + $this->threshold;
    if ($expiresAt <= $now) {
        return [];
    }

    unset($entry['timestamp']);
    return $entry;
}
/* MISC FUNCTIONS */
/**
 * Write a value to the PHP error log, prefixed with the script name.
 *
 * @param mixed $data value to log; rendered with print_r()
 */
function out($data) {
    $rendered = print_r($data, true);
    error_log("[getcomments.php] " . $rendered);
}
/* CACHE FUNCTIONS */
/**
 * Write data to a file as pretty-printed JSON, holding an exclusive lock
 * while writing.
 *
 * @param string $db path of the file
 * @param mixed $data value to encode
 */
function write_db($db, $data) {
    file_put_contents($db, json_encode($data, JSON_PRETTY_PRINT), LOCK_EX);
}
/**
 * Read JSON data from a file into $data.
 *
 * The file is created empty when it does not exist yet. $data always ends
 * up as an array: a missing, empty, unreadable or non-array file yields an
 * empty array, so callers can merge into the result without checking.
 *
 * @param string $db path of the file
 * @param array $data receives the decoded content
 */
function read_db($db, &$data) {
    if (!file_exists($db)) {
        touch($db);
    }

    // Read exactly the path that was checked above; no include-path lookup.
    $file = file_get_contents($db);
    $decoded = ($file === false) ? null : json_decode($file, true);

    $data = is_array($decoded) ? $decoded : array();
}
/* TOOT FUNCTIONS */
/**
 * Fetch up to 50 statuses of an account that are newer than $min_id,
 * excluding reblogs and replies.
 *
 * @param string $instance base URL of the instance
 * @param string $uid account id
 * @param string $min_id only statuses with a higher id are requested
 * @return array decoded statuses; empty array when the request fails or the
 *               response is not a JSON array/object
 */
function collectToots($instance, $uid, $min_id) {
    $url = "$instance/api/v1/accounts/" . rawurlencode((string)$uid)
        . "/statuses?exclude_reblogs=true&exclude_replies=true&limit=50&min_id=" . rawurlencode((string)$min_id);

    $raw = file_get_contents($url);
    if ($raw === false) {
        return array();
    }

    // Callers merge the result into an array, so never hand back null.
    $json = json_decode($raw, true);
    return is_array($json) ? $json : array();
}
/* Collect all the toots */
$toots = array();
/* get id of latest cached toot, and set as $min_id */
read_db($dbt, $toots);
if (!is_array($toots)) {
    /* an empty or unreadable cache file decodes to a non-array value */
    $toots = array();
}
/* always define $min_id_cached so the later comparison never reads an undefined variable */
$min_id_cached = !empty($toots[0]['id']) ? $toots[0]['id'] : null;
/* if cached toots do not exist, start from oldest toot */
$min_id = ($min_id_cached !== null) ? $min_id_cached : "0";
/* test whether there are new toots available */
$min_id_new = "1";
/* keep fetching until a round no longer changes the newest known id */
while ($min_id_new !== $min_id) {
    $min_id_new = $min_id;
    $fetched = collectToots($instance, $uid, $min_id);
    if (!is_array($fetched)) {
        /* a failed request must not break the merge */
        $fetched = array();
    }
    $toots = array_merge($fetched, $toots);
    /* the newest toot sits at position 0; keep the old id when there is no toot at all */
    if (isset($toots[0]['id'])) {
        $min_id = $toots[0]['id'];
    }
}
/* if newer toot has been found, find new URLs */
// TODO: only look up newly found toots
// The newest id after fetching differs from the cached one, so the URL index is rebuilt for all collected toots.
if ($min_id !== $min_id_cached) {
out("New toots found");
// ids of every collected toot (cached and newly fetched)
$ids = array_column($toots, 'id');
/**
 * Find out if a toot contains the searched URL.
 *
 * Fetches the status and looks in its content for a link that starts with
 * $searchurl, is followed by at least one more character and ends right
 * before the next double quote. The match is case-insensitive.
 *
 * @param string $instance base URL of the instance
 * @param string $id id of the status
 * @param string $searchurl URL prefix to look for; matched literally
 * @return string the lower-cased matching URL, or "" when there is none or
 *                the status cannot be fetched
 */
function analyzeToot($instance, $id, $searchurl) {
    $raw = file_get_contents("$instance/api/v1/statuses/$id");
    if ($raw === false) {
        return("");
    }

    $json = json_decode($raw, true);
    if (!is_array($json) || !isset($json['content']) || !is_string($json['content'])) {
        return("");
    }

    // Quote the prefix so characters such as '.', '?' or '|' in the URL are
    // matched literally instead of being read as regex syntax.
    $pattern = '|' . preg_quote($searchurl, '|') . '.+?(?=")|i';
    if (preg_match($pattern, $json['content'], $matches) === 1) {
        return(strtolower($matches[0]));
    }

    return("");
}
// Rebuild the index from scratch: one entry per toot id with the URL found in its content ("" when none was found).
$toots = array();
foreach ($ids as $id) {
$toots[] = array('id' => $id, 'url' => analyzeToot($instance, $id, $searchurl));
}
// Persist the rebuilt index to the toot cache file.
write_db($dbt, $toots);
} else {
out("No new toots found");
}
/* check if URL from $search exists in $toots */
// if multiple exist, take the oldest one (highest array position)
// array_keys() with a search value returns the positions whose 'url' equals the lower-cased search term (loose comparison).
$id = array_keys(array_column($toots, 'url'), strtolower($search));
// NOTE(review): when nothing matched, end() returns false and this reads $toots[false]; see the TODO below.
$id = $toots[end($id)]['id'];
// TODO: graceful exit if no toot exists
// NOTE(review): the hard-coded id below overrides the lookup result above; remove once testing is done.
$id = "102955148581768112"; // TODO test
/* Extract comments and stats from toot */
// Empty response skeleton: no comments and zeroed statistics.
$result = ['comments' => [], 'stats' => ['reblogs' => 0, 'favs' => 0, 'replies' => 0, 'url' => '', 'root' => 0]];
// Search term from the 'search' query parameter; empty string when absent.
$search = isset($_GET['search']) ? $_GET['search'] : '';
$collector = new CollectMastodonData($config);
$ids = [];
// Without a search term the empty skeleton is kept as the result.
if (!empty($search)) {
// Reuse a cached collection when one is available for this search term.
$oldCollection = $collector->getCachedCollection($search);
if (empty($oldCollection)) {
$ids = $collector->findToots($search);
// The first id of the sorted list is reported as the root toot; 0 when nothing was found.
$result['stats']['root'] = isset($ids[0]) ? $ids[0] : 0;
foreach ($ids as $id) {
// get comments
// NOTE(review): getComments() fills $result by reference and has no return statement, so $newComments stays unused.
$newComments = $collector->getComments($id, $result);
// get statistics (likes, replies, boosts,...)
$collector->getStatistics($id, $result);
// FIXME: At the moment the API doesn't return the correct replies count so I count it manually
$result['stats']['replies'] = count($result['comments']);
}
// Cache the freshly collected result under the search term.
$collector->storeCollection($search, $result);
} else {
$result = $oldCollection;
}
/**
 * Reduce raw descendant statuses to the fields needed for rendering and
 * add them to $result['comments'], keyed by status id.
 *
 * @param array $descendants raw statuses as decoded from the context endpoint
 * @param mixed $root id of the toot the descendants belong to
 * @param array $result collection that is extended in place
 * @return array the extended collection
 */
function filterComments($descendants, $root, &$result) {
    foreach ($descendants as $reply) {
        // Use the username whenever the display name is empty.
        if ($reply['account']['display_name']) {
            $shownName = $reply['account']['display_name'];
        } else {
            $shownName = $reply['account']['username'];
        }

        $comment = [];
        $comment['author'] = [
            'display_name' => $shownName,
            'avatar' => $reply['account']['avatar_static'],
            'url' => $reply['account']['url']
        ];
        $comment['toot'] = $reply['content'];
        $comment['date'] = $reply['created_at'];
        $comment['url'] = $reply['uri'];
        $comment['reply_to'] = $reply['in_reply_to_id'];
        $comment['root'] = $root;

        $result['comments'][$reply['id']] = $comment;
    }

    return $result;
}
/**
 * Fetch the context of a toot and add its descendants to $result['comments'].
 *
 * @param string $instance base URL of the instance
 * @param string $id id of the toot
 * @param array $result collection that is extended in place
 */
function tootContext($instance, $id, &$result) {
    $endpoint = "$instance/api/v1/statuses/$id/context";
    $context = json_decode(file_get_contents($endpoint), true);
    filterComments($context['descendants'], $id, $result);
}
/**
 * Extract the counters and the URL from a raw status.
 *
 * @param array $stats raw status as decoded from the API
 * @return array with the keys 'reblogs', 'favs', 'replies' (integers) and 'url'
 */
function filterStats($stats) {
    return [
        'reblogs' => intval($stats['reblogs_count']),
        'favs' => intval($stats['favourites_count']),
        'replies' => intval($stats['replies_count']),
        'url' => $stats['url']
    ];
}
/**
 * Fetch a toot and add its counters to the running totals in $result['stats'].
 * The 'url' entry is only filled when it is still empty.
 *
 * @param string $instance base URL of the instance
 * @param string $id id of the toot
 * @param array $result collection whose 'stats' entry is updated in place
 */
function tootStats($instance, $id, &$result) {
    out("Checking ID $id");

    $status = json_decode(file_get_contents("$instance/api/v1/statuses/$id"), true);
    $delta = filterStats($status);

    foreach (['reblogs', 'favs', 'replies'] as $counter) {
        $result['stats'][$counter] += $delta[$counter];
    }

    if (empty($result['stats']['url'])) {
        $result['stats']['url'] = $delta['url'];
    }
}
// FIXME: At the moment the API doesn't return the correct replies count so I count it manually
// NOTE(review): this runs before tootContext() below, so it counts only what $result['comments'] already holds,
// and tootStats() afterwards adds the API's replies_count on top; confirm the intended order.
$result['stats']['replies'] = count($result['comments']);
$result['stats']['root'] = $id;
// Add the toot's descendants to $result['comments'].
tootContext($instance, $id, $result);
// Add the toot's counters to $result['stats'].
tootStats($instance, $id, $result);
// headers for not caching the results
header('Cache-Control: no-cache, must-revalidate');
// an Expires date in the past marks the response as already stale
header('Expires: Mon, 26 Jul 1997 05:00:00 GMT');
@@ -185,5 +161,6 @@ header('Expires: Mon, 26 Jul 1997 05:00:00 GMT');
// headers to tell that result is JSON
header('Content-type: application/json');
// send the result now
// the collected $result array is the whole response body
echo json_encode($result);
?>