Created
March 29, 2021 05:59
-
-
Save Mati365/8b9cc7f8091fd78e90cdceebe766badf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import * as R from 'ramda'; | |
import stringSimilarity from 'string-similarity'; | |
import {createHTMLTag} from '@server/common/helpers/html/createHTMLTag'; | |
import {filterAndMap} from '@shared/helpers'; | |
import { | |
extractTextWords, | |
getHTMLInnerText, | |
isWordCharacter, | |
} from '@client/helpers/parsers'; | |
/**
 * Input accepted by hydrateTextWithLinks
 *
 * - text is handed to getHTMLInnerText before the keyword lookup
 * - tags are the named items that are searched for in the text
 * - linkGeneratorFn produces the attributes passed to
 *   createHTMLTag('a', ...) for a matched item
 */
export type LinkHydrateAttrs<T = any> = {
  text: string;
  tags: (T & {name: string})[];
  linkGeneratorFn(item: T): {
    href: string;
    rel?: string;
    class?: string;
    target?: string;
  };
};
/**
 * Result of hydrateTextWithLinks: the produced text together with
 * a list of tags (same element type as the input tags)
 */
export type LinkHydrateOutput<T> = {
  text: string;
  tags: LinkHydrateAttrs<T>['tags'];
};
/**
 * Single keyword match produced by pickTextSimilarKeywords:
 * the words of the text that matched and the item they matched
 */
export type TextSimilarKeywordInfo<T> = {
  words: string[];
  item: T & {name: string};
};
/**
 * Picks the tags whose names approximately occur in the text
 *
 * The name of every tag is split into words and slid over the words
 * of the lowercased text. A tag is accepted at the first position where
 * each word of its name is similar enough to the text word at the same
 * offset, so at most one occurrence per tag is reported. A name that does
 * not completely fit before the end of the text is never reported.
 *
 * @see
 *  It is very slow! Do not run it in request handler!
 *
 * @export
 * @template T
 * @param {LinkHydrateAttrs<T>['tags']} items
 * @param {string} text
 * @returns {Record<string, TextSimilarKeywordInfo<T>[]>} matches grouped by
 *  the word of the lowercased text that starts the matched phrase
 */
export function pickTextSimilarKeywords<T>(
  items: LinkHydrateAttrs<T>['tags'],
  text: string,
): Record<string, TextSimilarKeywordInfo<T>[]> {
  // ratings below this value mean "different word"
  const minSimilarity = 0.7;
  const words = extractTextWords(
    text.toLowerCase(),
  );

  const similarKeywords: [string, TextSimilarKeywordInfo<T>][] = filterAndMap(
    (item) => {
      const parts = extractTextWords(item.name.toLowerCase());
      if (!parts.length)
        return null;

      // start only at positions where the whole name still fits, otherwise
      // a name truncated by the end of the text would count as a match
      const lastStartIndex = words.length - parts.length;
      for (let startWordIndex = 0; startWordIndex <= lastStartIndex; ++startWordIndex) {
        let offset = 0;
        for (; offset < parts.length; ++offset) {
          const rating = stringSimilarity.compareTwoStrings(
            parts[offset],
            words[startWordIndex + offset],
          );

          if (rating < minSimilarity)
            break;
        }

        // inner loop ran to the end => every part matched
        if (offset === parts.length) {
          return [
            words[startWordIndex],
            {
              words: words.slice(startWordIndex, startWordIndex + parts.length),
              item,
            },
          ];
        }
      }

      return null;
    },
    items,
  );

  // group in a Map so keywords such as "constructor" or "__proto__" never
  // touch members inherited from Object.prototype
  const grouped = new Map<string, TextSimilarKeywordInfo<T>[]>();
  for (const [keyword, info] of similarKeywords) {
    const bucket = grouped.get(keyword);
    if (bucket)
      bucket.push(info);
    else
      grouped.set(keyword, [info]);
  }

  // Object.fromEntries defines own data properties, result is a plain object
  return Object.fromEntries(grouped);
}
/**
 * Injects anchors into description
 *
 * Walks the text character by character, copies markup verbatim and wraps
 * each phrase reported by pickTextSimilarKeywords in an <a> tag whose
 * attributes come from linkGeneratorFn. Phrases that are located inside
 * an already opened <a> are left untouched. The characters between the
 * words of a linked phrase are kept exactly as they appear in the text.
 *
 * @warn
 *  Markup is tracked with a simple stack: every non-closing tag is pushed
 *  (including void ones such as <br>) and every closing tag pops the top
 *  entry, so the anchor detection is only approximate.
 *
 * @export
 * @template T
 * @param {LinkHydrateAttrs<T>} attrs
 * @returns {LinkHydrateOutput<T>} text with anchors and all items picked
 *  by pickTextSimilarKeywords (also the ones that were not linked)
 */
export function hydrateTextWithLinks<T>(
  {
    text,
    tags,
    linkGeneratorFn,
  }: LinkHydrateAttrs<T>,
): LinkHydrateOutput<T> {
  if (!tags.length || !text) {
    return {
      text,
      tags: [],
    };
  }

  // prevent for matching shortest sentences inside longest sentences,
  // sort a copy so the array owned by the caller is not reordered
  const sortedTags = [...tags].sort(
    (a, b) => b.name.length - a.name.length,
  );

  const similarWords = pickTextSimilarKeywords(
    sortedTags,
    getHTMLInnerText(text),
  );

  if (R.isEmpty(similarWords)) {
    return {
      text,
      tags: [],
    };
  }

  const nesting: string[] = [];
  let wordAcc = '';
  let output = '';

  // Collects up to `count` words located after `startIndex` together with
  // the index just past each of them, stops at the first markup character
  const readFollowingWords = (startIndex: number, count: number) => {
    const found: {word: string, end: number}[] = [];
    let wordStart = -1;

    // j === text.length is visited on purpose to close a trailing word
    for (let j = startIndex; j <= text.length && found.length < count; ++j) {
      if (j < text.length && isWordCharacter(text[j])) {
        if (wordStart < 0)
          wordStart = j;
        continue;
      }

      if (wordStart >= 0) {
        found.push({
          word: text.slice(wordStart, j),
          end: j,
        });
        wordStart = -1;
      }

      // never let a phrase span over tags
      if (text[j] === '<')
        break;
    }

    return found;
  };

  // Writes the accumulated word (or an anchor that starts with it) to the
  // output, `i` is the index just past the word. Returns the index where
  // scanning must continue, it is greater than `i` only if a multi-word
  // phrase has been consumed.
  const flushWordAcc = (i: number): number => {
    if (!wordAcc)
      return i;

    const word = wordAcc;
    wordAcc = '';

    let parsedWord = word;
    let resumeIndex = i;

    if (!nesting.includes('a')) {
      // own-property lookup, words like "constructor" must not resolve
      // to members inherited from Object.prototype
      const key = word.toLowerCase();
      const matchedSentences = Object.prototype.hasOwnProperty.call(similarWords, key)
        ? similarWords[key]
        : null;

      if (matchedSentences) {
        const longestSentence = matchedSentences.reduce(
          (acc, {words}) => Math.max(acc, words.length),
          0,
        );

        // first word is already known, read only the remaining ones
        const following = readFollowingWords(i, longestSentence - 1);
        const sentence = matchedSentences.find(({words}) => {
          for (let k = 1; k < words.length; ++k) {
            const next = following[k - 1];
            if (!next || next.word.toLowerCase() !== words[k])
              return false;
          }

          return true;
        });

        if (sentence) {
          if (sentence.words.length > 1)
            resumeIndex = following[sentence.words.length - 2].end;

          // slice of the source keeps original letter case and separators
          parsedWord = createHTMLTag(
            'a',
            linkGeneratorFn(sentence.item),
            text.slice(i - word.length, resumeIndex),
          );
        }
      }
    }

    output += parsedWord;
    return resumeIndex;
  };

  let i = 0;
  while (i < text.length) {
    const c = text[i];

    if (isWordCharacter(c)) {
      wordAcc += c;
      ++i;
      continue;
    }

    if (wordAcc) {
      const resumeIndex = flushWordAcc(i);

      // current separator has been swallowed by a linked phrase
      if (resumeIndex !== i) {
        i = resumeIndex;
        continue;
      }
    }

    output += c;

    // eat tags
    if (c === '<') {
      const closeIndex = text.indexOf('>', i + 1);
      // unterminated tag: treat the rest of the text as its content
      const tagEnd = closeIndex === -1 ? text.length : closeIndex;
      const tagContent = text.slice(i + 1, tagEnd);

      output += tagContent;
      if (tagContent[0] === '/')
        nesting.pop();
      else
        nesting.push(tagContent.split(' ')[0]);

      // closing '>' (if present) is copied by the next iteration
      i = tagEnd;
      continue;
    }

    ++i;
  }

  flushWordAcc(text.length);

  return {
    text: output,
    tags: R.unnest(R.map(R.pluck('item'), R.values(similarWords))),
  };
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment