abler98 · November 23, 2020 19:29
diff --git a/DictionaryReplace.java b/DictionaryReplace.java
 package com.example;

 import java.util.*;
 import java.util.logging.Logger;
 import java.util.regex.Pattern;

 public class Main {

    // Use control characters for minimize chance of matches with original text
    private static final String UGLY_HACK_MARKER = "\u0080\u0081\u0082";
    private static final String REPLACEMENT = "*";
    private static final int BENCHMARK_NUM_OF_ITERS = 10_000_000;

    private final Logger logger = Logger.getLogger("main");

    public static void main(String[] args) {
        new Main().run();
    }

    public void run() {
        var dictionary = Arrays.asList(
                new DictionaryItem("при", DictionaryItem.ReplaceType.START),
                new DictionaryItem("вет", DictionaryItem.ReplaceType.ANYWHERE),
                new DictionaryItem("дела", DictionaryItem.ReplaceType.FULL_WORD)
        );

        var inputText = "Всем привет! Как дела? ~вет**~ Сюрприз?";
        var expectedText = "Всем *! Как *? ~***~ Сюрприз?";

        var calculatedResult = replaceWithCalculatedPositions(inputText, dictionary);

        if (calculatedResult.equals(expectedText)) {
            logger.info("Calculated result: " + calculatedResult);

            // Avg execution time: 20790 ms
            runBenchmark("calculated", () -> replaceWithCalculatedPositions(inputText, dictionary));
        } else {
            logger.warning("Invalid calculated result: " + calculatedResult);
        }

        var uglyResult = replaceWithUglyHack(inputText, dictionary);

        if (uglyResult.equals(expectedText)) {
            logger.info("Ugly result: " + uglyResult);

            // Avg execution time: 60793 ms
            runBenchmark("ugly", () -> replaceWithUglyHack(inputText, dictionary));
        } else {
            logger.warning("Invalid ugly result: " + uglyResult);
        }
    }

    private String replaceWithCalculatedPositions(String input, List<DictionaryItem> dictionary) {
        var matchResults = new LinkedList<WordMatch>();

        for (var item : dictionary) {
            item
                    .getPattern()
                    .matcher(input)
                    .results()
                    .forEach(matchResult -> matchResults.add(new WordMatch(
                            matchResult.start(),
                            matchResult.end()
                    )));
        }

        matchResults.sort(Comparator.comparingInt(WordMatch::getStart));

        var normalizedMatchResults = new LinkedList<WordMatch>();

        for (var match : matchResults) {
            var lastMatch = normalizedMatchResults.peekLast();

            // Join adjacent matches
            if (lastMatch != null && lastMatch.getEnd() >= match.getStart()) {
                normalizedMatchResults.removeLast();
                normalizedMatchResults.addLast(new WordMatch(lastMatch.getStart(), match.getEnd()));
            } else {
                normalizedMatchResults.addLast(match);
            }
        }

        var result = new StringBuilder();

        int offset = 0;

        for (var match : normalizedMatchResults) {
            result.append(input, offset, match.getStart());
            result.append(REPLACEMENT);
            offset = match.getEnd();
        }

        result.append(input.substring(offset));

        return result.toString();
    }

    // Simple but slow
    private String replaceWithUglyHack(String input, List<DictionaryItem> dictionary) {
        var wordBoundary = "(?<!" + UGLY_HACK_MARKER + ")(?:\\b)";

        for (var item : dictionary) {
            input = item
                    .getPattern(wordBoundary)
                    .matcher(input)
                    .replaceAll(UGLY_HACK_MARKER + "$1" + UGLY_HACK_MARKER);
        }

        return input
                .replaceAll("(" + UGLY_HACK_MARKER + "){2,}", "")
                .replaceAll(UGLY_HACK_MARKER + "(.+?)" + UGLY_HACK_MARKER, REPLACEMENT);
    }

    private void runBenchmark(String label, Runnable runnable) {
        var startTime = System.nanoTime();

        for (int i = 0; i < BENCHMARK_NUM_OF_ITERS; i++) {
            runnable.run();
        }

        var elapsedTimeMs = (System.nanoTime() - startTime) / 1e6;

        logger.info("Benchmark [" + label + "]: " + elapsedTimeMs + " ms");
    }

    private static class WordMatch {

        private final int start;
        private final int end;

        public WordMatch(int start, int end) {
            this.start = start;
            this.end = end;
        }

        public int getStart() {
            return start;
        }

        public int getEnd() {
            return end;
        }
    }

    private static class DictionaryItem {

        public enum ReplaceType {
            START,
            ANYWHERE,
            FULL_WORD,
        }

        private final Map<String, Pattern> patternCache = new HashMap<>();

        private final String value;
        private final ReplaceType replaceType;

        public DictionaryItem(String value, ReplaceType replaceType) {
            this.value = value;
            this.replaceType = replaceType;
        }

        public Pattern getPattern() {
            return getPattern("\\b");
        }

        public Pattern getPattern(String wordBoundary) {
            return patternCache.computeIfAbsent(wordBoundary, (key) -> {
                var quotedValue = Pattern.quote(value);

                var regex = switch (replaceType) {
                    case START -> wordBoundary + quotedValue;
                    case ANYWHERE -> quotedValue;
                    case FULL_WORD -> wordBoundary + quotedValue + wordBoundary;
                };

                return Pattern.compile(
                        "(" + regex + ")",
                        Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE
                );
            });
        }
    }
 }
	package com.example;

	import java.util.*;
	import java.util.logging.Logger;
	import java.util.regex.Pattern;

	public class Main {

	// Use control characters for minimize chance of matches with original text
	private static final String UGLY_HACK_MARKER = "\u0080\u0081\u0082";
	private static final String REPLACEMENT = "*";
	private static final int BENCHMARK_NUM_OF_ITERS = 10_000_000;

	private final Logger logger = Logger.getLogger("main");

	public static void main(String[] args) {
	new Main().run();
	}

	public void run() {
	var dictionary = Arrays.asList(
	new DictionaryItem("при", DictionaryItem.ReplaceType.START),
	new DictionaryItem("вет", DictionaryItem.ReplaceType.ANYWHERE),
	new DictionaryItem("дела", DictionaryItem.ReplaceType.FULL_WORD)
	);

	var inputText = "Всем привет! Как дела? ~вет**~ Сюрприз?";
	var expectedText = "Всем ! Как ? ~***~ Сюрприз?";

	var calculatedResult = replaceWithCalculatedPositions(inputText, dictionary);

	if (calculatedResult.equals(expectedText)) {
	logger.info("Calculated result: " + calculatedResult);

	// Avg execution time: 20790 ms
	runBenchmark("calculated", () -> replaceWithCalculatedPositions(inputText, dictionary));
	} else {
	logger.warning("Invalid calculated result: " + calculatedResult);
	}

	var uglyResult = replaceWithUglyHack(inputText, dictionary);

	if (uglyResult.equals(expectedText)) {
	logger.info("Ugly result: " + uglyResult);

	// Avg execution time: 60793 ms
	runBenchmark("ugly", () -> replaceWithUglyHack(inputText, dictionary));
	} else {
	logger.warning("Invalid ugly result: " + uglyResult);
	}
	}

	private String replaceWithCalculatedPositions(String input, List<DictionaryItem> dictionary) {
	var matchResults = new LinkedList<WordMatch>();

	for (var item : dictionary) {
	item
	.getPattern()
	.matcher(input)
	.results()
	.forEach(matchResult -> matchResults.add(new WordMatch(
	matchResult.start(),
	matchResult.end()
	)));
	}

	matchResults.sort(Comparator.comparingInt(WordMatch::getStart));

	var normalizedMatchResults = new LinkedList<WordMatch>();

	for (var match : matchResults) {
	var lastMatch = normalizedMatchResults.peekLast();

	// Join adjacent matches
	if (lastMatch != null && lastMatch.getEnd() >= match.getStart()) {
	normalizedMatchResults.removeLast();
	normalizedMatchResults.addLast(new WordMatch(lastMatch.getStart(), match.getEnd()));
	} else {
	normalizedMatchResults.addLast(match);
	}
	}

	var result = new StringBuilder();

	int offset = 0;

	for (var match : normalizedMatchResults) {
	result.append(input, offset, match.getStart());
	result.append(REPLACEMENT);
	offset = match.getEnd();
	}

	result.append(input.substring(offset));

	return result.toString();
	}

	// Simple but slow
	private String replaceWithUglyHack(String input, List<DictionaryItem> dictionary) {
	var wordBoundary = "(?<!" + UGLY_HACK_MARKER + ")(?:\\b)";

	for (var item : dictionary) {
	input = item
	.getPattern(wordBoundary)
	.matcher(input)
	.replaceAll(UGLY_HACK_MARKER + "$1" + UGLY_HACK_MARKER);
	}

	return input
	.replaceAll("(" + UGLY_HACK_MARKER + "){2,}", "")
	.replaceAll(UGLY_HACK_MARKER + "(.+?)" + UGLY_HACK_MARKER, REPLACEMENT);
	}

	private void runBenchmark(String label, Runnable runnable) {
	var startTime = System.nanoTime();

	for (int i = 0; i < BENCHMARK_NUM_OF_ITERS; i++) {
	runnable.run();
	}

	var elapsedTimeMs = (System.nanoTime() - startTime) / 1e6;

	logger.info("Benchmark [" + label + "]: " + elapsedTimeMs + " ms");
	}

	private static class WordMatch {

	private final int start;
	private final int end;

	public WordMatch(int start, int end) {
	this.start = start;
	this.end = end;
	}

	public int getStart() {
	return start;
	}

	public int getEnd() {
	return end;
	}
	}

	private static class DictionaryItem {

	public enum ReplaceType {
	START,
	ANYWHERE,
	FULL_WORD,
	}

	private final Map<String, Pattern> patternCache = new HashMap<>();

	private final String value;
	private final ReplaceType replaceType;

	public DictionaryItem(String value, ReplaceType replaceType) {
	this.value = value;
	this.replaceType = replaceType;
	}

	public Pattern getPattern() {
	return getPattern("\\b");
	}

	public Pattern getPattern(String wordBoundary) {
	return patternCache.computeIfAbsent(wordBoundary, (key) -> {
	var quotedValue = Pattern.quote(value);

	var regex = switch (replaceType) {
	case START -> wordBoundary + quotedValue;
	case ANYWHERE -> quotedValue;
	case FULL_WORD -> wordBoundary + quotedValue + wordBoundary;
	};

	return Pattern.compile(
	"(" + regex + ")",
	Pattern.UNICODE_CASE \| Pattern.CASE_INSENSITIVE
	);
	});
	}
	}
	}