View Javadoc
1   package com.acumenvelocity.ath.common;
2   
3   import com.ibm.icu.text.BreakIterator;
4   import net.sf.okapi.common.LocaleId;
5   
6   import java.util.ArrayList;
7   import java.util.Collections;
8   import java.util.List;
9   import java.util.TreeSet;
10  import java.util.concurrent.ConcurrentHashMap;
11  
12  /**
13   * Returns positions where inline codes can be safely inserted in text.
14   * Uses ICU4J word breaking for proper language support, augmented with
15   * positions after punctuation and whitespace.
16   * 
17   * Includes:
18   * - Position 0 (start of text)
19   * - All ICU4J word boundaries (language-aware)
20   * - After each punctuation and whitespace character
21   * - Position text.length() (end of text)
22   *
23   * <pre>
24   * "Hello, world!" → [0, 5, 6, 7, 12, 13]
25   * "日本語のテスト" → [0, 3, 4, 7] (CJK word breaks)
26   * </pre>
27   */
28  public final class OkapiWordBreaker {
29  
30    private static final ConcurrentHashMap<String, BreakIterator> WORD_CACHE = new ConcurrentHashMap<>();
31  
32    private OkapiWordBreaker() {
33    }
34  
35    public static List<Integer> getWordBreakPositions(String text, LocaleId locId) {
36      if (text == null || text.isEmpty()) {
37        return Collections.emptyList();
38      }
39      if (locId == null) {
40        throw new NullPointerException("LocaleId must not be null");
41      }
42  
43      com.ibm.icu.util.ULocale uLocale = locId.toIcuLocale();
44      String cacheKey = uLocale.toString();
45  
46      BreakIterator wordBreaker = WORD_CACHE.computeIfAbsent(cacheKey,
47          k -> BreakIterator.getWordInstance(uLocale));
48  
49      TreeSet<Integer> positions = new TreeSet<>();
50      
51      // Always add start and end positions
52      positions.add(0);
53      positions.add(text.length());
54      
55      // Add all ICU4J word boundaries (handles all languages properly)
56      synchronized (wordBreaker) {
57        wordBreaker.setText(text);
58        for (int boundary = wordBreaker.first();
59             boundary != BreakIterator.DONE;
60             boundary = wordBreaker.next()) {
61          positions.add(boundary);
62        }
63      }
64      
65      // Additionally add positions after punctuation and whitespace
66      // This ensures inline codes can be placed around each punctuation mark
67      for (int i = 0; i < text.length(); i++) {
68        char ch = text.charAt(i);
69        if (Character.isWhitespace(ch) || isPunctuation(ch)) {
70          positions.add(i + 1);
71        }
72      }
73  
74      return new ArrayList<>(positions);
75    }
76  
77    /**
78     * Checks if a character is punctuation using Unicode categories.
79     */
80    private static boolean isPunctuation(char ch) {
81      int type = Character.getType(ch);
82      return type == Character.DASH_PUNCTUATION
83          || type == Character.START_PUNCTUATION
84          || type == Character.END_PUNCTUATION
85          || type == Character.CONNECTOR_PUNCTUATION
86          || type == Character.OTHER_PUNCTUATION
87          || type == Character.INITIAL_QUOTE_PUNCTUATION
88          || type == Character.FINAL_QUOTE_PUNCTUATION;
89    }
90  }