// OkapiWordBreaker

package com.acumenvelocity.ath.common;

import com.ibm.icu.text.BreakIterator;
import net.sf.okapi.common.LocaleId;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Returns positions where inline codes can be safely inserted in text.
 * Uses ICU4J word breaking for proper language support, augmented with
 * positions after punctuation and whitespace.
 *
 * For non-empty text the result includes:
 * - Position 0 (start of text)
 * - All ICU4J word boundaries (language-aware)
 * - The position after each punctuation and whitespace code point
 * - Position text.length() (end of text)
 *
 * All positions are UTF-16 offsets into the text ({@code String} indices).
 *
 * <pre>
 * "Hello, world!" → [0, 5, 6, 7, 12, 13]
 * "日本語のテスト" → [0, 3, 4, 7] (CJK word breaks)
 * </pre>
 *
 * Cached break iterators are only ever used as prototypes: each call works on
 * its own clone, so no mutable iterator state is shared between callers.
 */
public final class OkapiWordBreaker {

  /**
   * Prototype word iterators keyed by ICU locale string. The cached instances
   * are never given text; they are cloned for each call.
   */
  private static final ConcurrentHashMap<String, BreakIterator> WORD_CACHE =
      new ConcurrentHashMap<>();

  private OkapiWordBreaker() {
  }

  /**
   * Computes the sorted, duplicate-free list of insertion positions for the given text.
   *
   * @param text  the text to analyze; may be {@code null}
   * @param locId the locale that selects the ICU4J word-break rules
   * @return ascending positions (UTF-16 offsets) for non-empty text, as a new mutable list;
   *         an immutable empty list when {@code text} is {@code null} or empty
   * @throws NullPointerException if {@code locId} is {@code null} and {@code text} is non-empty
   */
  public static List<Integer> getWordBreakPositions(String text, LocaleId locId) {
    if (text == null || text.isEmpty()) {
      return Collections.emptyList();
    }
    if (locId == null) {
      throw new NullPointerException("LocaleId must not be null");
    }

    com.ibm.icu.util.ULocale uLocale = locId.toIcuLocale();
    String cacheKey = uLocale.toString();

    BreakIterator prototype = WORD_CACHE.computeIfAbsent(cacheKey,
        k -> BreakIterator.getWordInstance(uLocale));

    // Work on a private clone instead of locking the cached instance: callers for
    // the same locale no longer serialize on one monitor, and the cache does not
    // keep a reference to the last analyzed text.
    BreakIterator wordBreaker = (BreakIterator) prototype.clone();

    // TreeSet keeps the positions sorted and removes duplicates.
    TreeSet<Integer> positions = new TreeSet<>();

    // Always add start and end positions
    positions.add(0);
    positions.add(text.length());

    // Add all ICU4J word boundaries (handles all languages properly)
    wordBreaker.setText(text);
    for (int boundary = wordBreaker.first();
         boundary != BreakIterator.DONE;
         boundary = wordBreaker.next()) {
      positions.add(boundary);
    }

    // Additionally add the position after each punctuation and whitespace code point.
    // ICU does not break at some punctuation between letters or digits (e.g. an
    // apostrophe inside a word), so these positions are not always word boundaries.
    // Iterating by code point classifies supplementary-plane characters as a whole
    // instead of as two surrogate halves.
    int index = 0;
    while (index < text.length()) {
      int codePoint = text.codePointAt(index);
      index += Character.charCount(codePoint);
      if (Character.isWhitespace(codePoint) || isPunctuation(codePoint)) {
        positions.add(index);
      }
    }

    return new ArrayList<>(positions);
  }

  /**
   * Checks if a code point is punctuation using Unicode general categories
   * (dash, start, end, connector, other, initial-quote and final-quote punctuation).
   */
  private static boolean isPunctuation(int codePoint) {
    switch (Character.getType(codePoint)) {
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
      case Character.INITIAL_QUOTE_PUNCTUATION:
      case Character.FINAL_QUOTE_PUNCTUATION:
        return true;
      default:
        return false;
    }
  }
}