📄 Viewing: SpellChecker.php
<?php
/* Finds similar pages.
* Finds search suggestions. */
class ABJ_404_Solution_SpellChecker {
private $separatingCharacters = array("-","_",".","~",'%20');
/** Same as above except without the period (.) because of the extension in the file name. */
private $separatingCharactersForImages = array("-","_","~",'%20');
private $publishedPostsProvider = null;
const MAX_DIST = 2083;
private static $instance = null;
private $custom404PageID = null;
/**
 * Return the shared spell checker, building it on the first call.
 * On creation the configured 'dest404page' option is remembered as the custom
 * 404 page ID when the plugin logic reports that a user-specified 404 page exists.
 * @return ABJ_404_Solution_SpellChecker
 */
public static function getInstance() {
    if (self::$instance != null) {
        return self::$instance;
    }

    // publish the instance before touching other services, exactly like the original flow.
    $checker = new ABJ_404_Solution_SpellChecker();
    self::$instance = $checker;

    $pluginLogic = ABJ_404_Solution_PluginLogic::getInstance();
    $options = $pluginLogic->getOptions();

    // read the custom 404 page id if there is one.
    $configuredPageID = null;
    if (array_key_exists('dest404page', $options) && isset($options['dest404page'])) {
        $configuredPageID = $options['dest404page'];
    }

    if ($pluginLogic->thereIsAUserSpecified404Page($configuredPageID)) {
        $checker->custom404PageID = $configuredPageID;
    }

    return self::$instance;
}
/**
 * Register the WordPress hooks that invalidate the spelling cache.
 * Saving or deleting a post, or updating an option (the listener itself filters for
 * the permalink structure), can change the suggestions that were cached earlier.
 */
static function init() {
    $listener = ABJ_404_Solution_SpellChecker::getInstance();

    // hook name, listener method, number of arguments the listener accepts.
    $registrations = array(
        array('updated_option', 'permalinkStructureChanged', 2),
        array('save_post', 'save_postListener', 3),
        array('delete_post', 'delete_postListener', 2),
    );

    foreach ($registrations as $registration) {
        add_action($registration[0], array($listener, $registration[1]), 10, $registration[2]);
    }
}
/**
 * Listener for the 'save_post' action. Fills in missing arguments and forwards
 * the event to savePostHandler() as a 'save'.
 * @param int $post_id
 * @param object|null $post the saved post; looked up with get_post() when omitted.
 * @param bool|null $update true when an existing post was updated, false for a new post.
 * Treated as true only when the argument was not supplied at all.
 */
function save_postListener($post_id, $post = null, $update = null) {
    if ($post == null) {
        $post = get_post($post_id);
    }
    // Strict check on purpose: with a loose "== null" an explicit false (a brand new
    // post) was overwritten with true, so the "new page" path could never be taken.
    if ($update === null) {
        $update = true;
    }
    $this->savePostHandler($post_id, $post, $update, 'save');
}
/**
 * Listener for the 'delete_post' action. Looks the post up when it was not passed
 * in and forwards the event to savePostHandler() as a 'delete'.
 * @param int $post_id
 * @param object|null $post
 */
function delete_postListener($post_id, $post = null) {
    $affectedPost = ($post == null) ? get_post($post_id) : $post;
    $this->savePostHandler($post_id, $affectedPost, true, 'delete');
}
/**
 * Decide which caches must be invalidated after a post was saved or deleted.
 * The previous slug, status and type are read from the permalink cache row and
 * compared with the current post. Changes to uninteresting post types or statuses
 * are only logged. Otherwise the permalink cache entry and/or the whole spelling
 * cache are deleted.
 * @param int $post_id
 * @param object $post the post being saved or deleted.
 * @param bool $update true for an update of an existing post, false for a new post.
 * @param string $saveOrDelete 'save' or 'delete'.
 */
function savePostHandler($post_id, $post, $update, $saveOrDelete) {
    $abj404dao = ABJ_404_Solution_DataAccess::getInstance();
    $abj404logging = ABJ_404_Solution_Logging::getInstance();
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $f = ABJ_404_Solution_Functions::getInstance();

    $logPrefix = __CLASS__ . "/" . __FUNCTION__;
    // the user agent is only used in log messages.
    $httpUserAgent = array_key_exists("HTTP_USER_AGENT", $_SERVER) ?
        $_SERVER['HTTP_USER_AGENT'] : "(none)";

    $options = $abj404logic->getOptions();
    $postType = $post->post_type;
    $acceptedPostTypes = $f->explodeNewline($options['recognized_post_types']);

    // what did the post look like the last time it was cached?
    $cacheRow = $abj404dao->getPermalinkEtcFromCache($post_id);
    if (!isset($cacheRow)) {
        $cacheRow = array();
    }
    $oldSlug = '(not found)';
    if (array_key_exists('url', $cacheRow)) {
        $oldSlug = trim($cacheRow['url'], '/');
    }
    $newSlug = $post->post_name;
    $metaRow = array_key_exists('meta', $cacheRow) ? $cacheRow['meta'] : '';

    // the meta column carries "s:<status>," and "t:<type>," markers.
    $statusMatch = array();
    preg_match('/s:(\\w+?),/', $metaRow, $statusMatch);
    $oldStatus = (count($statusMatch) > 1) ? $statusMatch[1] : '(not found)';
    $typeMatch = array();
    preg_match('/t:(\\w+?),/', $metaRow, $typeMatch);
    $oldPostType = (count($typeMatch) > 1) ? $typeMatch[1] : '(not found)';

    // neither the old nor the new post type is one we care about: log and stop.
    $typeIsInteresting = in_array($oldPostType, $acceptedPostTypes) ||
        in_array($post->post_type, $acceptedPostTypes);
    if (!$typeIsInteresting) {
        $abj404logging->debugMessage($logPrefix .
            ": Ignored savePost change (uninteresting post types). " .
            "Action: " . $saveOrDelete . ", ID: " . $post_id . ", types: " .
            $oldPostType . "/" . $post->post_type . ", agent: " .
            $httpUserAgent);
        return;
    }

    // neither the old nor the new status is a published one: log and stop.
    $interestingStatuses = array('publish', 'published');
    $statusIsInteresting = in_array($oldStatus, $interestingStatuses) ||
        in_array($post->post_status, $interestingStatuses);
    if (!$statusIsInteresting) {
        $abj404logging->debugMessage($logPrefix .
            ": Ignored savePost change (uninteresting post statuses). " .
            "Action: " . $saveOrDelete . ", ID: " . $post_id . ", statuses: " .
            $oldStatus . "/" . $post->post_status . ", agent: " .
            $httpUserAgent);
        return;
    }

    // three cases, evaluated in this order: updated page, new page, deleted page.
    $deleteSpellingCache = false;
    $deleteFromPermalinkCache = false;
    $reason = '';

    // updated page: only act when the slug, status or type changed.
    $somethingChanged = ($oldSlug != $newSlug ||
        $oldStatus != $post->post_status ||
        $oldPostType != $post->post_type);
    if ($update && $saveOrDelete == 'save' && $somethingChanged) {
        $deleteSpellingCache = true; // TODO only delete where the page is referenced.
        $deleteFromPermalinkCache = true;
        $reason = 'change. slug (' . $oldSlug . '(to)' . $newSlug . '), status (' .
            $oldStatus . '(to)' . $post->post_status . '), type (' . $oldPostType .
            '(to)' . $post->post_type . ')';
    }

    // new page: it may match old searches better than the previously cached results.
    if (!$update && $saveOrDelete == 'save') {
        $deleteSpellingCache = true; // delete all.
        $deleteFromPermalinkCache = false; // it's not there anyway.
        $reason = 'new page';
    }

    // deleted page.
    if ($saveOrDelete == 'delete') {
        $deleteSpellingCache = true; // TODO only delete where the page is referenced.
        $deleteFromPermalinkCache = true;
        $reason = 'deleted page';
    }

    if ($deleteFromPermalinkCache) {
        $abj404logging->debugMessage($logPrefix .
            ": Delete from permalink cache: " . $post_id . ", action: " .
            $saveOrDelete . ", reason: " . $reason);
        $abj404dao->removeFromPermalinkCache($post_id);
        // refresh some of the cached links right away.
        ABJ_404_Solution_PermalinkCache::getInstance()->updatePermalinkCache(0.1);
    }

    if (!$deleteSpellingCache) {
        return;
    }

    // TODO only delete the items from the cache that refer
    // to the post ID that was deleted?
    ABJ_404_Solution_DataAccess::getInstance()->deleteSpellingCache();
    if ($abj404logging->isDebug()) {
        $abj404logging->debugMessage($logPrefix .
            ": Spelling cache deleted (post change). Action: " . $saveOrDelete .
            ", ID: " . $post_id . ", type: " . $postType . ", reason: " .
            $reason . ", agent: " . $httpUserAgent);
    }
}
/**
 * Listener for the 'updated_option' action. Deletes the spelling cache when the
 * option that changed is 'permalink_structure'; every other option is ignored.
 * NOTE(review): WordPress documents 'updated_option' as passing ($option, $old_value,
 * $value). The hook is registered with 2 accepted arguments, so $newStructure probably
 * receives the OLD value and the log message below may be misleading - confirm.
 * @param string $var1 the name of the option that was updated.
 * @param mixed $newStructure the value that is written to the log.
 */
function permalinkStructureChanged($var1, $newStructure) {
    if ($var1 == 'permalink_structure') {
        $structureForLog = $newStructure;
        if (empty($newStructure)) {
            $structureForLog = '(empty)';
        }

        $dataAccess = ABJ_404_Solution_DataAccess::getInstance();
        $logger = ABJ_404_Solution_Logging::getInstance();

        $dataAccess->deleteSpellingCache();
        $logger->debugMessage(__CLASS__ . "/" . __FUNCTION__ .
            ": Spelling cache deleted because the permalink structure changed " .
            "to " . $structureForLog);
    }
}
/** Find a match using the user-defined regex patterns.
* @global type $abj404dao
* @param string $requestedURL
* @return array|null the permalink info of the first matching regex redirect, or null when none matches.
*/
function getPermalinkUsingRegEx($requestedURL) {
    $abj404dao = ABJ_404_Solution_DataAccess::getInstance();
    $f = ABJ_404_Solution_Functions::getInstance();
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $options = $abj404logic->getOptions();

    // every redirect whose source URL is a user-defined regular expression.
    $regexURLsRows = $abj404dao->getRedirectsWithRegEx();

    foreach ($regexURLsRows as $row) {
        $regexURL = $row['url'];

        // note which pattern is being applied in the request-scoped debug info.
        $_REQUEST[ABJ404_PP]['debug_info'] = 'Applying custom regex "' . $regexURL . '" to URL: ' .
            $requestedURL;

        // escape forward slashes before matching.
        $preparedURL = $f->str_replace('/', '\/', $regexURL);
        if (!$f->regexMatch($preparedURL, $requestedURL)) {
            $_REQUEST[ABJ404_PP]['debug_info'] = 'Cleared after regex.';
            continue;
        }

        $_REQUEST[ABJ404_PP]['debug_info'] = 'Cleared after regex.';
        $idAndType = $row['final_dest'] . '|' . $row['type'];
        $permalink = ABJ_404_Solution_Functions::permalinkInfoToArray($idAndType, '0',
            null, $options);
        $permalink['matching_regex'] = $regexURL;
        $originalPermalink = $permalink;

        // if the matching regex contains a group and the destination contains a replacement,
        // then use them.
        $regexMatchResult = $f->regexMatch("\.*\(.+\).*", $regexURL);
        $replacementStrPosResult = $f->strpos($permalink['link'], '$');
        if (($regexMatchResult != 0) && ($replacementStrPosResult !== FALSE)) {
            $results = array();
            // NOTE(review): this match uses the raw $regexURL while the test above uses the
            // slash-escaped $preparedURL - confirm that regexMatch() accepts both forms.
            $f->regexMatch($regexURL, $requestedURL, $results);

            // do a replacement for all of the groups found. The highest group number goes
            // first so that "$1" can not clobber the beginning of "$10", "$11", etc.
            $final = $permalink['link'];
            for ($x = count($results) - 1; $x >= 1; $x--) {
                $groupValue = isset($results[$x]) ? $results[$x] : '';
                $final = $f->str_replace('$' . $x, $groupValue, $final);
            }
            $permalink['link'] = $final;
        }

        $abj404logging = ABJ_404_Solution_Logging::getInstance();
        $abj404logging->debugMessage("Found matching regex. Original permalink" .
            json_encode($originalPermalink) . ", final: " .
            json_encode($permalink));

        // the first matching redirect wins.
        return $permalink;
    }

    return null;
}
/** Find a match using an exact slug match.
* If there is a post that has a slug that matches the user requested slug exactly,
* then return the permalink for that post. Otherwise return null.
* @global type $abj404dao
* @param string $requestedURL
* @return array|null
*/
function getPermalinkUsingSlug($requestedURL) {
    $dataAccess = ABJ_404_Solution_DataAccess::getInstance();
    $logger = ABJ_404_Solution_Logging::getInstance();

    // the slug is the last non-empty path segment of the requested URL.
    $pathSegments = array_filter(explode('/', $requestedURL));
    if (empty($pathSegments)) {
        return null;
    }
    $postSlug = end($pathSegments);

    $postsBySlugRows = $dataAccess->getPublishedPagesAndPostsIDs($postSlug);
    $matchCount = count($postsBySlugRows);

    if ($matchCount > 1) {
        // more than one post has the same slug, so there is no unambiguous answer.
        $logger->debugMessage("More than one post found with the slug, so no redirect was " .
            "created. Slug: " . $postSlug);
        return null;
    }

    if ($matchCount != 1) {
        $logger->debugMessage("No posts or pages matching slug: " . esc_html($postSlug));
        return null;
    }

    // exactly one post matched. the score doesn't matter.
    $post = reset($postsBySlugRows);
    return array(
        'id' => $post->id,
        'type' => ABJ404_TYPE_POST,
        'score' => 100,
        'title' => get_the_title($post->id),
        'link' => get_permalink($post->id),
    );
}
/** Find a match using spell checking.
* Use spell checking to find the correct link. Return the permalink (map) if there is one, otherwise return null.
* @global type $abj404spellChecker
* @global type $abj404logic
* @param string $requestedURL
* @return array|null
*/
function getPermalinkUsingSpelling($requestedURL) {
    $abj404spellChecker = ABJ_404_Solution_SpellChecker::getInstance();
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $abj404logging = ABJ_404_Solution_Logging::getInstance();
    $options = $abj404logic->getOptions();

    // only continue when the site owner wants automatic redirects. isset() replaces the
    // error suppression operator that used to hide a missing option.
    if (!isset($options['auto_redirects']) || $options['auto_redirects'] != '1') {
        return null;
    }

    $permalinksPacket = $abj404spellChecker->findMatchingPosts($requestedURL,
        $options['auto_cats'], $options['auto_tags']);
    $permalinks = $permalinksPacket[0];
    $rowType = $permalinksPacket[1];
    $minScore = $options['auto_score'];

    // nothing was suggested, so there is nothing to redirect to. Without this guard
    // reset()/key() yield false/null, which were then handed to permalinkInfoToArray().
    if (!is_array($permalinks) || empty($permalinks)) {
        return null;
    }

    // since the links were previously sorted so that the highest score would be first,
    // we only use the first element of the array.
    $linkScore = reset($permalinks);
    $idAndType = key($permalinks);
    $permalink = ABJ_404_Solution_Functions::permalinkInfoToArray($idAndType, $linkScore,
        $rowType, $options);

    if (!($permalink['score'] >= $minScore)) {
        // the best suggestion is not good enough.
        return null;
    }

    // We found a permalink that will work, as long as it is a real destination.
    $redirectType = $permalink['type'];
    if (('' . $redirectType != ABJ404_TYPE_404_DISPLAYED) && ('' . $redirectType != ABJ404_TYPE_HOME)) {
        return $permalink;
    }

    $abj404logging->errorMessage("Unhandled permalink type: " .
        wp_kses_post(json_encode($permalink)));
    return null;
}
/**
* Return true if the last characters of the URL represent an image extension (like jpg, gif, etc).
* @param string $requestedURL
*/
function requestIsForAnImage($requestedURL) {
    $stringHelper = ABJ_404_Solution_Functions::getInstance();

    // the extensions that count as an "image" request (".jif" is listed twice, as before).
    $knownExtensions = array(".jpg", ".jpeg", ".gif", ".png", ".tif", ".tiff", ".bmp", ".pdf",
        ".jif", ".jif", ".jp2", ".jpx", ".j2k", ".j2c", ".pcd");

    foreach ($knownExtensions as $candidate) {
        if ($stringHelper->endsWithCaseInsensitive($requestedURL, $candidate)) {
            // the first hit settles it.
            return true;
        }
    }

    return false;
}
/** Returns the best matching permalinks for the requested URL together with the row type.
* @global type $wpdb
* @param string $requestedURLRaw
* @param string $includeCats
* @param string $includeTags
* @return array
*/
function findMatchingPosts($requestedURLRaw, $includeCats = '1', $includeTags = '1') {
    $f = ABJ_404_Solution_Functions::getInstance();
    $abj404dao = ABJ_404_Solution_DataAccess::getInstance();
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $options = $abj404logic->getOptions();

    // the number of pages to cache is (max suggestions) + (the number of excluded pages).
    // (if either of these numbers increases then we need to clear the spelling cache.)
    // The emptiness test is explicit now; the previous "!trim(...) == ''" form only
    // worked because of operator precedence. This mirrors removeExcludedPages().
    $excludePagesCount = 0;
    $excludePagesJson = isset($options['excludePages[]']) ? trim($options['excludePages[]']) : '';
    if ($excludePagesJson !== '') {
        $jsonResult = json_decode($excludePagesJson);
        if (!is_array($jsonResult)) {
            $jsonResult = array($jsonResult);
        }
        $excludePagesCount = count($jsonResult);
    }
    $maxCacheCount = absint($options['suggest_max']) + $excludePagesCount;

    // turn the separator characters into spaces and isolate the last part of the URL.
    $requestedURLSpaces = $f->str_replace($this->separatingCharacters, " ", $requestedURLRaw);
    $requestedURLCleaned = $this->getLastURLPart($requestedURLSpaces);
    $fullURLspacesCleaned = $f->str_replace('/', " ", $requestedURLSpaces);
    // if there is no extra stuff in the path then we ignore this to save time.
    if ($fullURLspacesCleaned == $requestedURLCleaned) {
        $fullURLspacesCleaned = '';
    }

    // prepare to get some posts.
    $this->initializePublishedPostsProvider();

    // The row type is always 'pages' here, so the former "images only" early return
    // could never run and has been removed.
    $rowType = 'pages';
    $permalinks = array();

    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - match on posts
    $permalinks = $this->matchOnPosts($permalinks, $requestedURLRaw, $requestedURLCleaned,
        $fullURLspacesCleaned, $rowType);

    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - match on tags
    if ($includeTags == "1") {
        $permalinks = $this->matchOnTags($permalinks, $requestedURLCleaned, $fullURLspacesCleaned, 'tags');
    }

    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - match on categories
    if ($includeCats == "1") {
        $permalinks = $this->matchOnCats($permalinks, $requestedURLCleaned, $fullURLspacesCleaned, 'categories');
    }

    // remove excluded pages
    $permalinks = $this->removeExcludedPages($options, $permalinks);

    // This is sorted so that the link with the highest score will be first when iterating through.
    arsort($permalinks);

    $permalinks = $this->removeExcludedPagesWithRegex($options, $permalinks, $maxCacheCount);

    // only keep what we need. store them for later if necessary.
    $permalinks = array_splice($permalinks, 0, $maxCacheCount);

    $returnValue = array($permalinks, $rowType);
    $abj404dao->storeSpellingPermalinksToCache($requestedURLRaw, $returnValue);

    // keep the result in the request so that it can be read back later in the same request.
    $_REQUEST[ABJ404_PP]['permalinks_found'] = json_encode($returnValue);
    $_REQUEST[ABJ404_PP]['permalinks_kept'] = json_encode($permalinks);

    return $returnValue;
}
/**
 * Drop the pages listed in the 'excludePages[]' option, and the user-specified
 * 404 page when there is one, from the suggestions.
 * @param array $options plugin options; 'excludePages[]' holds JSON.
 * @param array $permalinks suggestions keyed by "ID|type".
 * @return array the suggestions without the excluded entries.
 */
function removeExcludedPages($options, $permalinks) {
    $excludePagesJson = $options['excludePages[]'];
    $hasCustom404Page = ($this->custom404PageID != null);

    // nothing to exclude at all.
    if (trim($excludePagesJson) == '' && !$hasCustom404Page) {
        return $permalinks;
    }

    $pagesToExclude = json_decode($excludePagesJson);
    if (!is_array($pagesToExclude)) {
        $pagesToExclude = array($pagesToExclude);
    }

    // don't include the user specified 404 page in the spelling results.
    if ($hasCustom404Page) {
        $pagesToExclude[] = $this->custom404PageID;
    }

    foreach ($pagesToExclude as $entry) {
        if ($entry == null || trim($entry) == '') {
            continue;
        }
        // the "ID|type" key is whatever precedes the first |\| separator.
        $pieces = explode("|\\|", $entry);
        unset($permalinks[$pieces[0]]);
    }

    return $permalinks;
}
/**
* Removes permalink suggestions if their URL path matches exclusion regex patterns.
*
* @param array $options Plugin options containing 'suggest_regex_exclusions_usable'.
* @param array $permalinks An array where keys are "ID|TYPE_CONSTANT" and values are scores.
* Example: [ '1204|1' => '70.0000', '2194|1' => '68.3333' ]
* @param int $maxCacheCount Filtering stops once this many suggestions have been kept.
* @return array The filtered $permalinks array.
*/
function removeExcludedPagesWithRegex($options, $permalinks, $maxCacheCount) {
    $f = ABJ_404_Solution_Functions::getInstance();
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $abj404logging = ABJ_404_Solution_Logging::getInstance();

    // anything that is not an array is handed back untouched.
    if (!is_array($permalinks)) {
        return $permalinks;
    }

    // without a non-empty array of usable patterns there is nothing to filter.
    $exclusionPatterns = isset($options['suggest_regex_exclusions_usable']) ?
        $options['suggest_regex_exclusions_usable'] : null;
    if (!is_array($exclusionPatterns) || empty($exclusionPatterns)) {
        return $permalinks;
    }

    $keptCount = 0;

    // walk over a snapshot of the keys because entries are removed along the way.
    foreach (array_keys($permalinks) as $key) {
        // the key has the form "ID|TYPE_CONSTANT".
        $pieces = explode('|', $key);
        if (count($pieces) !== 2 || !is_numeric($pieces[0])) {
            $abj404logging->debugMessage("Skipping invalid key format in removeExcludedPagesWithRegex: " . $key);
            continue;
        }
        $id = (int)$pieces[0];
        $typeConstant = $pieces[1];

        // translate the type constant into the row type that getPermalink() understands.
        $rowTypeString = $this->mapTypeConstantToString($typeConstant);
        if ($rowTypeString === null) {
            $abj404logging->debugMessage("Skipping unknown type constant in removeExcludedPagesWithRegex: " . $typeConstant . " for key: " . $key);
            continue;
        }

        $urlOfPage = $this->getPermalink($id, $rowTypeString);
        if ($urlOfPage === null || trim($urlOfPage) === '') {
            $abj404logging->debugMessage("Skipping null/empty URL for key in removeExcludedPagesWithRegex: " . $key);
            continue;
        }

        $urlParts = parse_url($urlOfPage);
        if (!is_array($urlParts) || !isset($urlParts['path'])) {
            $abj404logging->debugMessage("Skipping URL that failed parse_url for key in removeExcludedPagesWithRegex: " . $key . ", URL: " . esc_url($urlOfPage));
            continue;
        }

        // the patterns are matched against the path without the home directory,
        // always starting with a slash (an empty path becomes "/").
        $stringToMatch = $abj404logic->removeHomeDirectory($urlParts['path']);
        if ($stringToMatch === '') {
            $stringToMatch = '/';
        } else if (substr($stringToMatch, 0, 1) !== '/') {
            $stringToMatch = '/' . $stringToMatch;
        }

        // the first pattern that matches removes the suggestion.
        $wasExcluded = false;
        foreach ($exclusionPatterns as $pattern) {
            $unslashedPattern = stripslashes($pattern);
            $matches = array();
            if (!$f->regexMatch($unslashedPattern, $stringToMatch, $matches)) {
                continue;
            }
            unset($permalinks[$key]);
            $abj404logging->debugMessage("Regex excluded suggestion. Key: " . $key .
                ", Path: '" . esc_html($stringToMatch) . "', Pattern: '" . esc_html($unslashedPattern) . "'");
            $wasExcluded = true;
            break;
        }

        // count the survivors and stop filtering once we have as many as we need.
        if (!$wasExcluded) {
            $keptCount++;
        }
        if ($keptCount >= $maxCacheCount) {
            break;
        }
    }

    return $permalinks;
}
/**
* Maps internal type constants to string identifiers used by getPermalink.
* NOTE: Requires ABJ404_TYPE_* constants to be defined correctly.
*
* @param mixed $typeConstant The type constant (e.g., ABJ404_TYPE_POST).
* @return string|null The string identifier ('pages', 'tags', 'categories') or null if not found.
*/
private function mapTypeConstantToString($typeConstant) {
    // make sure the constants exist; these fallback values only apply when the
    // constants have not been defined elsewhere.
    $fallbackValues = array(
        'ABJ404_TYPE_POST' => '1',
        'ABJ404_TYPE_TAG' => '2',
        'ABJ404_TYPE_CAT' => '3',
    );
    foreach ($fallbackValues as $constantName => $fallbackValue) {
        if (!defined($constantName)) {
            define($constantName, $fallbackValue);
        }
    }

    // compare as a string, loosely, in the same order as before.
    $typeAsString = (string)$typeConstant;
    if ($typeAsString == ABJ404_TYPE_POST) {
        // getPermalink() uses 'pages' for posts.
        return 'pages';
    }
    if ($typeAsString == ABJ404_TYPE_TAG) {
        return 'tags';
    }
    if ($typeAsString == ABJ404_TYPE_CAT) {
        return 'categories';
    }

    // any other type (an image type, for example) is not mapped.
    return null;
}
/**
 * Reduce a list of row objects to plain arrays holding only 'id', 'term_id' and 'url'.
 * A property that does not exist on a row becomes null. The rows are taken from the
 * end of the input, so the result is in reverse order, and processing stops at the
 * first entry that loosely equals null.
 * @param array $rowsAsObject
 * @return array
 */
function getOnlyIDandTermID($rowsAsObject) {
    $wantedFields = array('id', 'term_id', 'url');
    $simplifiedRows = array();

    foreach (array_reverse($rowsAsObject) as $rowObject) {
        if ($rowObject == null) {
            break;
        }
        $simplified = array();
        foreach ($wantedFields as $field) {
            $simplified[$field] = property_exists($rowObject, $field) ? $rowObject->$field : null;
        }
        $simplifiedRows[] = $simplified;
    }

    return $simplifiedRows;
}
/**
 * Look up previously found suggestions: first in the current request, then in the
 * database spelling cache.
 * @param string $requestedURL
 * @return array the cached value, or an empty array when nothing is cached.
 */
function getFromPermalinkCache($requestedURL) {
    // The request cache is used when the suggested pages shortcode is used.
    $foundInRequest = array_key_exists(ABJ404_PP, $_REQUEST) &&
        array_key_exists('permalinks_found', $_REQUEST[ABJ404_PP]) &&
        !empty($_REQUEST[ABJ404_PP]['permalinks_found']);
    if ($foundInRequest) {
        return json_decode($_REQUEST[ABJ404_PP]['permalinks_found'], true);
    }

    // fall back to the database cache.
    $cached = ABJ_404_Solution_DataAccess::getInstance()->getSpellingPermalinksFromCache($requestedURL);
    return empty($cached) ? array() : $cached;
}
/**
 * Score the published categories against the requested URL and add them to the
 * suggestions under the key "<term id>|ABJ404_TYPE_CAT".
 * The score is 100 - (levenshtein distance / path length * 100), using the smallest
 * distance of up to three comparisons (whole path, path with separators as spaces,
 * last path part only).
 * @param array $permalinks suggestions collected so far.
 * @param string $requestedURLCleaned the last part of the requested URL.
 * @param string $fullURLspacesCleaned the whole requested path with spaces, or '' to skip it.
 * @param string $rowType not used by this method.
 * @return array $permalinks with the category scores added.
 */
function matchOnCats($permalinks, $requestedURLCleaned, $fullURLspacesCleaned, $rowType) {
    $abj404dao = ABJ_404_Solution_DataAccess::getInstance();
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $f = ABJ_404_Solution_Functions::getInstance();

    $rows = $abj404dao->getPublishedCategories();
    $rows = $this->getOnlyIDandTermID($rows);

    // pre-filter some pages based on the min and max possible levenshtein distances.
    $likelyMatchIDsAndPermalinks = $this->getLikelyMatchIDs($requestedURLCleaned, $fullURLspacesCleaned, 'categories', $rows);
    $likelyMatchIDs = array_keys($likelyMatchIDsAndPermalinks);

    foreach ($likelyMatchIDs as $id) {
        $the_permalink = $this->getPermalink($id, 'categories');
        $urlParts = parse_url($the_permalink);
        // parse_url() returns false for malformed URLs and omits 'path' when there is
        // none; such a link can not be scored, so skip it instead of reading a missing key.
        if (!is_array($urlParts) || !isset($urlParts['path'])) {
            continue;
        }
        $pathOnly = $abj404logic->removeHomeDirectory($urlParts['path']);
        $scoreBasis = $f->strlen($pathOnly);
        if ($scoreBasis == 0) {
            continue;
        }

        // use the levenshtein distance formula here.
        $levscore = $this->customLevenshtein($requestedURLCleaned, $pathOnly);
        if ($fullURLspacesCleaned != '') {
            $pathOnlySpaces = $f->str_replace($this->separatingCharacters, " ", $pathOnly);
            $pathOnlySpaces = trim($f->str_replace('/', " ", $pathOnlySpaces));
            $levscore = min($levscore, $this->customLevenshtein($fullURLspacesCleaned, $pathOnlySpaces));
        }
        // also compare against the last part of the category path only.
        $onlyLastPart = $this->getLastURLPart($pathOnly);
        if ($onlyLastPart != '' && $onlyLastPart != $pathOnly) {
            $levscore = min($levscore, $this->customLevenshtein($requestedURLCleaned, $onlyLastPart));
        }

        $score = 100 - (($levscore / $scoreBasis) * 100);
        $permalinks[$id . "|" . ABJ404_TYPE_CAT] = number_format($score, 4, '.', '');
    }

    return $permalinks;
}
/**
 * Score the published tags against the requested URL and add them to the
 * suggestions under the key "<term id>|ABJ404_TYPE_TAG".
 * The score is 100 - (levenshtein distance / path length * 100), using the smaller
 * distance of the whole path and the path with separators turned into spaces.
 * @param array $permalinks suggestions collected so far.
 * @param string $requestedURLCleaned the last part of the requested URL.
 * @param string $fullURLspacesCleaned the whole requested path with spaces, or '' to skip it.
 * @param string $rowType not used by this method.
 * @return array $permalinks with the tag scores added.
 */
function matchOnTags($permalinks, $requestedURLCleaned, $fullURLspacesCleaned, $rowType) {
    $abj404dao = ABJ_404_Solution_DataAccess::getInstance();
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $f = ABJ_404_Solution_Functions::getInstance();

    $rows = $abj404dao->getPublishedTags();
    $rows = $this->getOnlyIDandTermID($rows);

    // pre-filter some pages based on the min and max possible levenshtein distances.
    $likelyMatchIDsAndPermalinks = $this->getLikelyMatchIDs($requestedURLCleaned, $fullURLspacesCleaned, 'tags', $rows);
    $likelyMatchIDs = array_keys($likelyMatchIDsAndPermalinks);

    foreach ($likelyMatchIDs as $id) {
        $the_permalink = $this->getPermalink($id, 'tags');
        $urlParts = parse_url($the_permalink);
        // parse_url() returns false for malformed URLs and omits 'path' when there is
        // none; such a link can not be scored, so skip it instead of reading a missing key.
        if (!is_array($urlParts) || !isset($urlParts['path'])) {
            continue;
        }
        $pathOnly = $abj404logic->removeHomeDirectory($urlParts['path']);
        $scoreBasis = $f->strlen($pathOnly);
        if ($scoreBasis == 0) {
            continue;
        }

        // use the levenshtein distance formula here.
        $levscore = $this->customLevenshtein($requestedURLCleaned, $pathOnly);
        if ($fullURLspacesCleaned != '') {
            $pathOnlySpaces = $f->str_replace($this->separatingCharacters, " ", $pathOnly);
            $pathOnlySpaces = trim($f->str_replace('/', " ", $pathOnlySpaces));
            $levscore = min($levscore, $this->customLevenshtein($fullURLspacesCleaned, $pathOnlySpaces));
        }

        $score = 100 - (($levscore / $scoreBasis) * 100);
        $permalinks[$id . "|" . ABJ404_TYPE_TAG] = number_format($score, 4, '.', '');
    }

    return $permalinks;
}
/**
 * Score the likely matching posts/pages against the requested URL and add them to
 * the suggestions under the key "<post id>|ABJ404_TYPE_POST".
 * The score is 100 - (levenshtein distance / (3 * length of the cleaned last URL part) * 100).
 * For the 'image' row type the requested name is also tried with a trailing
 * "-<width>x<height>" size suffix removed.
 * @param array $permalinks suggestions collected so far.
 * @param string $requestedURLRaw the requested URL as it was received.
 * @param string $requestedURLCleaned the last part of the requested URL.
 * @param string $fullURLspacesCleaned the whole requested path with spaces, or '' to skip it.
 * @param string $rowType 'pages' or 'image'.
 * @return array $permalinks with the post scores added.
 */
function matchOnPosts($permalinks, $requestedURLRaw, $requestedURLCleaned, $fullURLspacesCleaned, $rowType) {
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $f = ABJ_404_Solution_Functions::getInstance();
    $abj404logger = ABJ_404_Solution_Logging::getInstance();

    // pre-filter some pages based on the min and max possible levenshtein distances.
    $likelyMatchIDsAndPermalinks = $this->getLikelyMatchIDs($requestedURLCleaned, $fullURLspacesCleaned, $rowType);
    $likelyMatchIDs = array_keys($likelyMatchIDsAndPermalinks);
    $abj404logger->debugMessage("Found " . count($likelyMatchIDs) . " likely match IDs.");

    // consume the IDs from the end of the list, one at a time.
    while (count($likelyMatchIDs) > 0) {
        $id = array_pop($likelyMatchIDs);

        $the_permalink = $likelyMatchIDsAndPermalinks[$id];
        $urlParts = parse_url($the_permalink);
        // parse_url() returns false for malformed URLs and omits 'path' when there is
        // none; such a link can not be scored, so skip it instead of reading a missing key.
        if (!is_array($urlParts) || !isset($urlParts['path'])) {
            continue;
        }
        $existingPageURL = $abj404logic->removeHomeDirectory($urlParts['path']);
        $existingPageURLSpaces = $f->str_replace($this->separatingCharacters, " ", $existingPageURL);
        $existingPageURLCleaned = $this->getLastURLPart($existingPageURLSpaces);
        $scoreBasis = $f->strlen($existingPageURLCleaned) * 3;
        if ($scoreBasis == 0) {
            continue;
        }

        // use the levenshtein distance formula here.
        $levscore = $this->customLevenshtein($requestedURLCleaned, $existingPageURLCleaned);
        if ($fullURLspacesCleaned != '') {
            $levscore = min($levscore, $this->customLevenshtein($fullURLspacesCleaned, $existingPageURLCleaned));
        }

        if ($rowType == 'image') {
            // strip the image size from the file name and try again.
            // the image size is at the end of the file in the format of -640x480
            $strippedImageName = $f->regexReplace('(.+)([-]\d{1,5}[x]\d{1,5})([.].+)',
                '\\1\\3', $requestedURLRaw);
            if (($strippedImageName != null) && ($strippedImageName != $requestedURLRaw)) {
                $strippedImageName = $f->str_replace($this->separatingCharactersForImages, " ", $strippedImageName);
                $levscore = min($levscore, $this->customLevenshtein($strippedImageName, $existingPageURL));
                $strippedImageName = $this->getLastURLPart($strippedImageName);
                $levscore = min($levscore, $this->customLevenshtein($strippedImageName, $existingPageURLCleaned));
            }
        }

        $score = 100 - (($levscore / $scoreBasis) * 100);
        $permalinks[$id . "|" . ABJ404_TYPE_POST] = number_format($score, 4, '.', '');
    }

    return $permalinks;
}
/**
 * Make sure the published posts provider is available and ask the permalink cache
 * to update itself (called with the argument 1).
 */
function initializePublishedPostsProvider() {
    $providerIsMissing = ($this->publishedPostsProvider == null);
    if ($providerIsMissing) {
        $this->publishedPostsProvider = ABJ_404_Solution_PublishedPostsProvider::getInstance();
    }

    ABJ_404_Solution_PermalinkCache::getInstance()->updatePermalinkCache(1);
}
/**
* Get the permalink for the passed in type (pages, tags, categories, image, etc.).
* @param int $id
* @param string $rowType
* @return string|null the URL-decoded link, or null when an image source can not be found.
* @throws Exception
*/
function getPermalink($id, $rowType) {
    switch ($rowType) {
        case 'pages':
            // prefer the cached permalink and fall back to WordPress when it is empty.
            $link = ABJ_404_Solution_DataAccess::getInstance()->getPermalinkFromCache($id);
            if ($link == null || trim($link) == '') {
                $link = get_the_permalink($id);
            }
            return urldecode($link);

        case 'tags':
            return urldecode(get_tag_link($id));

        case 'categories':
            return urldecode(get_category_link($id));

        case 'image':
            $imageSource = wp_get_attachment_image_src($id, "attached-image");
            if ($imageSource == false || !is_array($imageSource)) {
                return null;
            }
            // the first element is used as the URL.
            return urldecode($imageSource[0]);

        default:
            throw new \Exception("Unknown row type ...");
    }
}
/** This algorithm uses the lengths of the strings to weed out some strings before using the levenshtein
* distance formula. It uses the minimum and maximum possible levenshtein distance based on the difference in
* string length. The min distance based on length between "abc" and "def" is 0 and the max distance is 3.
* The min distance based on length between "abc" and "123456" is 3 and the max distance is 6.
* 1) Get a list of minimum and maximum levenshtein distances - two lists, one ordered by the min distance
* and one ordered by the max distance.
* 2) Get the first X strings from the max-distance list. The X is the number we have to display in the list
* of suggestions on the 404 page. Note the highest max distance of the strings we're using here.
* 3) Look at the min distance list and remove all strings where the min distance is more than the highest
* max distance taken from the previous step. The strings we remove here will always be further away than the
* strings we found in the previous step and can be removed without applying the levenshtein algorithm.
* *
* @param string $requestedURLCleaned
* @param string $fullURLspaces
* @param array $publishedPages
* @param string $rowType
* @return array
*/
function getLikelyMatchIDs($requestedURLCleaned, $fullURLspaces, $rowType, $rows = null) {
    // Walks the published rows in batches, buckets every ID by a cheap lower bound (min) and
    // upper bound (max) on its edit distance to the requested URL, and then keeps only the IDs
    // whose lower bound is not worse than the upper bound of the best candidates.
    // $rowType must be 'pages', 'tags', 'categories' or 'image'; anything else throws \Exception.
    // $rows, when not null, is handed to the published posts provider via useThisData().
    $abj404logic = ABJ_404_Solution_PluginLogic::getInstance();
    $abj404logging = ABJ_404_Solution_Logging::getInstance();
    $f = ABJ_404_Solution_Functions::getInstance();
    $options = $abj404logic->getOptions();

    // we get more than we need because the algorithm we actually use
    // is not based solely on the Levenshtein distance.
    $onlyNeedThisManyPages = min(5 * absint($options['suggest_max']), 100);

    /* 1) Get a list of minimum and maximum levenshtein distances - two lists, one ordered by the min
     * distance and one ordered by the max distance. */
    $minDistances = array();
    $maxDistances = array();
    for ($currentDistanceIndex = 0; $currentDistanceIndex <= self::MAX_DIST; $currentDistanceIndex++) {
        $maxDistances[$currentDistanceIndex] = array();
        $minDistances[$currentDistanceIndex] = array();
    }

    $requestedURLCleanedLength = $f->strlen($requestedURLCleaned);
    $fullURLspacesLength = $f->strlen($fullURLspaces);
    $userRequestedURLWords = explode(" ", (empty($fullURLspaces) ? $requestedURLCleaned : $fullURLspaces));
    $idsWithWordsInCommon = array();
    $wasntReadyCount = 0;
    $idToPermalink = array();

    // get the next X pages in batches until enough matches are found.
    $this->publishedPostsProvider->resetBatch();
    if ($rows != null) {
        $this->publishedPostsProvider->useThisData($rows);
    }
    $currentBatch = $this->publishedPostsProvider->getNextBatch($requestedURLCleanedLength);
    $row = array_pop($currentBatch);

    while ($row != null) {
        $row = (array)$row;
        $id = null;
        $the_permalink = null;
        $urlParts = null;

        if ($rowType == 'pages') {
            $id = $row['id'];
        } else if ($rowType == 'tags') {
            $id = array_key_exists('term_id', $row) ? $row['term_id'] : null;
        } else if ($rowType == 'categories') {
            $id = array_key_exists('term_id', $row) ? $row['term_id'] : null;
        } else if ($rowType == 'image') {
            $id = $row['id'];
        } else {
            throw new \Exception("Unknown row type ... " . esc_html($rowType));
        }

        if (array_key_exists('url', $row)) {
            $the_permalink = isset($row['url']) ? $row['url'] : '';
            $the_permalink = urldecode($the_permalink);
            $urlParts = parse_url($the_permalink);

            // parse_url() returns false for a seriously malformed URL: drop the cached value.
            if (is_bool($urlParts)) {
                $abj404dao = ABJ_404_Solution_DataAccess::getInstance();
                $abj404dao->removeFromPermalinkCache($id);
            }
        }
        if (!array_key_exists('url', $row) || (isset($urlParts) && is_bool($urlParts))) {
            // no usable cached permalink: look it up directly.
            $wasntReadyCount++;
            $the_permalink = $this->getPermalink($id, $rowType);
            $the_permalink = urldecode($the_permalink);
            $urlParts = parse_url($the_permalink);
        }

        $_REQUEST[ABJ404_PP]['debug_info'] = 'Likely match IDs processing permalink: ' .
            $the_permalink . ', $wasntReadyCount: ' . $wasntReadyCount;
        $idToPermalink[$id] = $the_permalink;

        // Rows whose permalink has no path (or could not be parsed at all) are skipped, but we must
        // still fall through to the "next row" code below. A bare `continue` here would re-process
        // the same row forever, and array_key_exists() on `false` is a TypeError on PHP 8.
        if (is_array($urlParts) && array_key_exists('path', $urlParts)) {
            $existingPageURL = $abj404logic->removeHomeDirectory($urlParts['path']);
            $urlParts = null;

            // this line used to take too long to execute.
            $existingPageURLSpaces = $f->str_replace($this->separatingCharacters, " ", $existingPageURL);
            $existingPageURLCleaned = $this->getLastURLPart($existingPageURLSpaces);
            $existingPageURLSpaces = null;
            $existingPageURLCleanedLength = $f->strlen($existingPageURLCleaned);

            // the minimum distance is the minimum of the two possibilities (the cleaned request and
            // the full request with spaces): the length difference is a lower bound on the distance.
            $minDist = abs($existingPageURLCleanedLength - $requestedURLCleanedLength);
            if ($fullURLspaces != '') {
                $minDist = min($minDist, abs($existingPageURLCleanedLength - $fullURLspacesLength));
            }

            $maxDist = $existingPageURLCleanedLength;
            if ($fullURLspaces != '') {
                $maxDist = min($maxDist, $fullURLspacesLength);
            }

            // -----------------
            // split the links into words.
            $existingPageURLCleanedWords = explode(" ", $existingPageURLCleaned);
            $wordsInCommon = array_intersect($userRequestedURLWords, $existingPageURLCleanedWords);
            $wordsInCommon = array_merge(array_unique($wordsInCommon, SORT_REGULAR), array());
            if (count($wordsInCommon) > 0) {
                // if any words match then save the link to the $idsWithWordsInCommon list.
                array_push($idsWithWordsInCommon, $id);
                // also lower the $maxDist accordingly.
                $lengthOfTheLongestWordInCommon = max(array_map(array($f, 'strlen'), $wordsInCommon));
                $maxDist = $maxDist - $lengthOfTheLongestWordInCommon;
            }

            // -----------------
            // add the ID to the list.
            if (isset($minDistances[$minDist]) && is_array($minDistances[$minDist])) {
                array_push($minDistances[$minDist], $id);
            } else {
                $minDistances[$minDist] = [$id];
            }

            if ($maxDist < 0) {
                $abj404logging->errorMessage("maxDist is less than 0 (" . $maxDist .
                    ") for '" . $existingPageURLCleaned . "', wordsInCommon: " .
                    json_encode($wordsInCommon) . ", ");
            } else if ($maxDist > self::MAX_DIST) {
                $maxDist = self::MAX_DIST;
            }

            // a negative $maxDist has no bucket; isset() avoids an "undefined array key" warning.
            if (isset($maxDistances[$maxDist]) && is_array($maxDistances[$maxDist])) {
                array_push($maxDistances[$maxDist], $id);
            }
        }

        // get the next row in the current batch.
        $row = array_pop($currentBatch);
        if ($row == null) {
            // get the best maxDistance pages and then trim the next batch using that info.
            $maxAcceptableDistance = $this->getMaxAcceptableDistance($maxDistances, $onlyNeedThisManyPages);

            // get the next batch if there are no more rows in the current batch.
            $currentBatch = $this->publishedPostsProvider->getNextBatch(
                $requestedURLCleanedLength, 1000, $maxAcceptableDistance);
            $row = array_pop($currentBatch);
        }
    }

    $_REQUEST[ABJ404_PP]['debug_info'] = '';
    if ($wasntReadyCount > 0) {
        $abj404logging->infoMessage("The permalink cache wasn't ready for " . $wasntReadyCount . " IDs.");
    }

    /* 2) Get the first X strings from the max-distance list. The X is the number we have to display in the
     * list of suggestions on the 404 page. Note the highest max distance of the strings we're using here. */
    $pagesSeenSoFar = 0;
    $maxDistFound = 300;
    for ($currentDistanceIndex = 0; $currentDistanceIndex <= 300; $currentDistanceIndex++) {
        $pagesSeenSoFar += sizeof($maxDistances[$currentDistanceIndex]);

        // we only need the closest matching X pages. where X is the number of suggestions
        // to display on the 404 page.
        if ($pagesSeenSoFar >= $onlyNeedThisManyPages) {
            $maxDistFound = $currentDistanceIndex;
            break;
        }
    }

    /* 3) Look at the min distance list and remove all strings where the min distance is more than the
     * highest max distance taken from the previous step. The strings we remove here will always be further
     * away than the strings we found in the previous step and can be removed without applying the
     * levenshtein algorithm. */
    $listOfIDsToReturn = array();
    for ($currentDistanceIndex = 0; $currentDistanceIndex <= $maxDistFound; $currentDistanceIndex++) {
        $listOfMinDistanceIDs = $minDistances[$currentDistanceIndex];
        $listOfIDsToReturn = array_merge($listOfIDsToReturn, $listOfMinDistanceIDs);
    }

    // if there are more than X IDs to return, then only use the matches where words match.
    // NOTE(review): these two returns are lists of IDs, while the normal return below is a map of
    // ID => permalink. Confirm against the callers which shape is expected.
    if (count($listOfIDsToReturn) > 300 && count($idsWithWordsInCommon) >= $onlyNeedThisManyPages) {
        $maybeOKguesses = array_intersect($listOfIDsToReturn, $idsWithWordsInCommon);

        if (count($maybeOKguesses) >= $onlyNeedThisManyPages) {
            return $maybeOKguesses;
        }
        return $idsWithWordsInCommon;
    }

    // map every surviving ID to the permalink recorded while scanning the rows.
    $result = array();
    foreach ($listOfIDsToReturn as $id) {
        if (isset($idToPermalink[$id])) {
            $result[$id] = $idToPermalink[$id];
        }
    }
    return $result;
}
/**
* @param array $maxDistances
* @param int $onlyNeedThisManyPages
* @return int the maximum acceptable distance to use when searching for similar permalinks.
*/
function getMaxAcceptableDistance($maxDistances, $onlyNeedThisManyPages) {
    // Scan the buckets from distance 0 upwards (at most up to 300) and stop at the first
    // distance where the running number of pages reaches the number we need.
    $distanceCeiling = 300;
    $bestDistance = $distanceCeiling;
    $runningTotal = 0;
    $distance = 0;

    while ($distance <= $distanceCeiling) {
        $runningTotal += count($maxDistances[$distance]);

        // enough candidates collected: this is the distance we were looking for.
        if ($runningTotal >= $onlyNeedThisManyPages) {
            $bestDistance = $distance;
            break;
        }
        $distance++;
    }

    // allow 10% of slack because the distance algorithm doesn't only use the levenshtein.
    return (int)($bestDistance * 1.1);
}
/** Turns "/abc/defg" into "defg"
* @param string $url
* @return string
*/
function getLastURLPart($url) {
    // Walk the "/"-separated segments from the end and hand back the first one that is not
    // blank. The segment is returned exactly as found (it is not trimmed).
    $segments = explode("/", $url);
    foreach (array_reverse($segments) as $segment) {
        if (trim($segment) != "") {
            return $segment;
        }
    }

    // every segment was blank: fall back to the original value.
    return $url;
}
/**
* @param string $str
* @return array
*/
private function multiByteStringToArray($str) {
    // Cut the string into pieces of length 1, using the plugin's own strlen()/substr()
    // helpers, and collect the pieces in order (keys 0..length-1).
    $f = ABJ_404_Solution_Functions::getInstance();
    $characters = array();
    $characterCount = $f->strlen($str);
    $position = 0;

    while ($position < $characterCount) {
        $characters[] = $f->substr($str, $position, 1);
        $position++;
    }

    return $characters;
}
/** This custom levenshtein function has no 255 character limit.
* From https://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm
* @param string $str1
* @param string $str2
* @return int
* @throws Exception
*/
function customLevenshtein($str1, $str2) {
    // Two-row Levenshtein distance. Only two vectors of strlen($str1) + 1 entries are kept in
    // memory instead of the full matrix, so there is no 255 character limit.
    // Throws Exception when either string is longer than ABJ404_MAX_URL_LENGTH.
    $f = ABJ_404_Solution_Functions::getInstance();
    $_REQUEST[ABJ404_PP]['debug_info'] = 'customLevenshtein. str1: ' . esc_html($str1) . ', str2: ' . esc_html($str2);

    $RowLen = $f->strlen($str1);
    $ColLen = $f->strlen($str2);

    // Test string length. URLs should not be more than 2,083 characters
    if (max($RowLen, $ColLen) > ABJ404_MAX_URL_LENGTH) {
        throw new Exception("Maximum string length in customLevenshtein is " .
            ABJ404_MAX_URL_LENGTH . ". Yours is " . max($RowLen, $ColLen) . ".");
    }

    // Step 1: the distance to an empty string is the length of the other string.
    if ($RowLen == 0) {
        return $ColLen;
    } else if ($ColLen == 0) {
        return $RowLen;
    }

    // The lengths above come from $f->strlen(), but $str[$i] reads a single BYTE. Whenever the
    // reported length differs from the byte length the string holds multibyte characters, so it
    // is split into an array of characters to keep the indexes and the lengths consistent.
    // Plain single-byte strings keep using cheap string offsets.
    $rowChars = (strlen($str1) == $RowLen) ? $str1 : $this->multiByteStringToArray($str1);
    $colChars = (strlen($str2) == $ColLen) ? $str2 : $this->multiByteStringToArray($str2);

    // Create the two vectors
    $v0 = array_fill(0, $RowLen + 1, 0);
    $v1 = array_fill(0, $RowLen + 1, 0);

    // Step 2: initialize the first vector (distance from the empty prefix of $str2).
    for ($RowIdx = 1; $RowIdx <= $RowLen; $RowIdx++) {
        $v0[$RowIdx] = $RowIdx;
    }

    // Step 3: for each column
    for ($ColIdx = 1; $ColIdx <= $ColLen; $ColIdx++) {
        // Set the 0'th element to the column number
        $v1[0] = $ColIdx;
        $colChar = $colChars[$ColIdx - 1];

        // Step 4: for each row take the cheapest of deletion, insertion and substitution.
        for ($RowIdx = 1; $RowIdx <= $RowLen; $RowIdx++) {
            $cost = ($rowChars[$RowIdx - 1] === $colChar) ? 0 : 1;
            $v1[$RowIdx] = min($v0[$RowIdx] + 1, $v1[$RowIdx - 1] + 1, $v0[$RowIdx - 1] + $cost);
        }

        // Swap the vectors
        $vTmp = $v0;
        $v0 = $v1;
        $v1 = $vTmp;
    }

    $_REQUEST[ABJ404_PP]['debug_info'] = 'Cleared after customLevenshtein.';
    return $v0[$RowLen];
}
}
🌑 DarkStealth — WP Plugin Edition
Directory: /home/httpd/html/matrixmodels.com/public_html/wp-content/plugins/404-solution/includes