1 package com.acumenvelocity.ath.common;
2
3 import com.ibm.icu.text.BreakIterator;
4 import net.sf.okapi.common.LocaleId;
5
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.List;
9 import java.util.TreeSet;
10 import java.util.concurrent.ConcurrentHashMap;
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 public final class OkapiWordBreaker {
29
30 private static final ConcurrentHashMap<String, BreakIterator> WORD_CACHE = new ConcurrentHashMap<>();
31
32 private OkapiWordBreaker() {
33 }
34
35 public static List<Integer> getWordBreakPositions(String text, LocaleId locId) {
36 if (text == null || text.isEmpty()) {
37 return Collections.emptyList();
38 }
39 if (locId == null) {
40 throw new NullPointerException("LocaleId must not be null");
41 }
42
43 com.ibm.icu.util.ULocale uLocale = locId.toIcuLocale();
44 String cacheKey = uLocale.toString();
45
46 BreakIterator wordBreaker = WORD_CACHE.computeIfAbsent(cacheKey,
47 k -> BreakIterator.getWordInstance(uLocale));
48
49 TreeSet<Integer> positions = new TreeSet<>();
50
51
52 positions.add(0);
53 positions.add(text.length());
54
55
56 synchronized (wordBreaker) {
57 wordBreaker.setText(text);
58 for (int boundary = wordBreaker.first();
59 boundary != BreakIterator.DONE;
60 boundary = wordBreaker.next()) {
61 positions.add(boundary);
62 }
63 }
64
65
66
67 for (int i = 0; i < text.length(); i++) {
68 char ch = text.charAt(i);
69 if (Character.isWhitespace(ch) || isPunctuation(ch)) {
70 positions.add(i + 1);
71 }
72 }
73
74 return new ArrayList<>(positions);
75 }
76
77
78
79
80 private static boolean isPunctuation(char ch) {
81 int type = Character.getType(ch);
82 return type == Character.DASH_PUNCTUATION
83 || type == Character.START_PUNCTUATION
84 || type == Character.END_PUNCTUATION
85 || type == Character.CONNECTOR_PUNCTUATION
86 || type == Character.OTHER_PUNCTUATION
87 || type == Character.INITIAL_QUOTE_PUNCTUATION
88 || type == Character.FINAL_QUOTE_PUNCTUATION;
89 }
90 }