Created
November 23, 2020 19:29
-
-
Save abler98/7773268d6b1dcda392123b4009867d3b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.example; | |
import java.util.*; | |
import java.util.logging.Logger; | |
import java.util.regex.Pattern; | |
public class Main { | |
// Use control characters for minimize chance of matches with original text | |
private static final String UGLY_HACK_MARKER = "\u0080\u0081\u0082"; | |
private static final String REPLACEMENT = "*"; | |
private static final int BENCHMARK_NUM_OF_ITERS = 10_000_000; | |
private final Logger logger = Logger.getLogger("main"); | |
public static void main(String[] args) { | |
new Main().run(); | |
} | |
public void run() { | |
var dictionary = Arrays.asList( | |
new DictionaryItem("при", DictionaryItem.ReplaceType.START), | |
new DictionaryItem("вет", DictionaryItem.ReplaceType.ANYWHERE), | |
new DictionaryItem("дела", DictionaryItem.ReplaceType.FULL_WORD) | |
); | |
var inputText = "Всем привет! Как дела? ~вет**~ Сюрприз?"; | |
var expectedText = "Всем *! Как *? ~***~ Сюрприз?"; | |
var calculatedResult = replaceWithCalculatedPositions(inputText, dictionary); | |
if (calculatedResult.equals(expectedText)) { | |
logger.info("Calculated result: " + calculatedResult); | |
// Avg execution time: 20790 ms | |
runBenchmark("calculated", () -> replaceWithCalculatedPositions(inputText, dictionary)); | |
} else { | |
logger.warning("Invalid calculated result: " + calculatedResult); | |
} | |
var uglyResult = replaceWithUglyHack(inputText, dictionary); | |
if (uglyResult.equals(expectedText)) { | |
logger.info("Ugly result: " + uglyResult); | |
// Avg execution time: 60793 ms | |
runBenchmark("ugly", () -> replaceWithUglyHack(inputText, dictionary)); | |
} else { | |
logger.warning("Invalid ugly result: " + uglyResult); | |
} | |
} | |
private String replaceWithCalculatedPositions(String input, List<DictionaryItem> dictionary) { | |
var matchResults = new LinkedList<WordMatch>(); | |
for (var item : dictionary) { | |
item | |
.getPattern() | |
.matcher(input) | |
.results() | |
.forEach(matchResult -> matchResults.add(new WordMatch( | |
matchResult.start(), | |
matchResult.end() | |
))); | |
} | |
matchResults.sort(Comparator.comparingInt(WordMatch::getStart)); | |
var normalizedMatchResults = new LinkedList<WordMatch>(); | |
for (var match : matchResults) { | |
var lastMatch = normalizedMatchResults.peekLast(); | |
// Join adjacent matches | |
if (lastMatch != null && lastMatch.getEnd() >= match.getStart()) { | |
normalizedMatchResults.removeLast(); | |
normalizedMatchResults.addLast(new WordMatch(lastMatch.getStart(), match.getEnd())); | |
} else { | |
normalizedMatchResults.addLast(match); | |
} | |
} | |
var result = new StringBuilder(); | |
int offset = 0; | |
for (var match : normalizedMatchResults) { | |
result.append(input, offset, match.getStart()); | |
result.append(REPLACEMENT); | |
offset = match.getEnd(); | |
} | |
result.append(input.substring(offset)); | |
return result.toString(); | |
} | |
// Simple but slow | |
private String replaceWithUglyHack(String input, List<DictionaryItem> dictionary) { | |
var wordBoundary = "(?<!" + UGLY_HACK_MARKER + ")(?:\\b)"; | |
for (var item : dictionary) { | |
input = item | |
.getPattern(wordBoundary) | |
.matcher(input) | |
.replaceAll(UGLY_HACK_MARKER + "$1" + UGLY_HACK_MARKER); | |
} | |
return input | |
.replaceAll("(" + UGLY_HACK_MARKER + "){2,}", "") | |
.replaceAll(UGLY_HACK_MARKER + "(.+?)" + UGLY_HACK_MARKER, REPLACEMENT); | |
} | |
private void runBenchmark(String label, Runnable runnable) { | |
var startTime = System.nanoTime(); | |
for (int i = 0; i < BENCHMARK_NUM_OF_ITERS; i++) { | |
runnable.run(); | |
} | |
var elapsedTimeMs = (System.nanoTime() - startTime) / 1e6; | |
logger.info("Benchmark [" + label + "]: " + elapsedTimeMs + " ms"); | |
} | |
private static class WordMatch { | |
private final int start; | |
private final int end; | |
public WordMatch(int start, int end) { | |
this.start = start; | |
this.end = end; | |
} | |
public int getStart() { | |
return start; | |
} | |
public int getEnd() { | |
return end; | |
} | |
} | |
private static class DictionaryItem { | |
public enum ReplaceType { | |
START, | |
ANYWHERE, | |
FULL_WORD, | |
} | |
private final Map<String, Pattern> patternCache = new HashMap<>(); | |
private final String value; | |
private final ReplaceType replaceType; | |
public DictionaryItem(String value, ReplaceType replaceType) { | |
this.value = value; | |
this.replaceType = replaceType; | |
} | |
public Pattern getPattern() { | |
return getPattern("\\b"); | |
} | |
public Pattern getPattern(String wordBoundary) { | |
return patternCache.computeIfAbsent(wordBoundary, (key) -> { | |
var quotedValue = Pattern.quote(value); | |
var regex = switch (replaceType) { | |
case START -> wordBoundary + quotedValue; | |
case ANYWHERE -> quotedValue; | |
case FULL_WORD -> wordBoundary + quotedValue + wordBoundary; | |
}; | |
return Pattern.compile( | |
"(" + regex + ")", | |
Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE | |
); | |
}); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment