1 package com.acumenvelocity.ath.common;
2
3 import java.io.IOException;
4 import java.io.StringReader;
5 import java.util.ArrayList;
6 import java.util.HashMap;
7 import java.util.HashSet;
8 import java.util.List;
9 import java.util.Map;
10 import java.util.Set;
11
12 import org.apache.lucene.analysis.Analyzer;
13 import org.apache.lucene.analysis.TokenStream;
14 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
15 import org.slf4j.Logger;
16 import org.slf4j.LoggerFactory;
17
18 import com.google.common.io.Files;
19
20 import net.sf.okapi.common.LocaleId;
21 import net.sf.okapi.common.MimeTypeMapper;
22 import net.sf.okapi.common.annotation.AltTranslation;
23 import net.sf.okapi.common.annotation.AltTranslationsAnnotation;
24 import net.sf.okapi.common.exceptions.OkapiException;
25 import net.sf.okapi.common.resource.Code;
26 import net.sf.okapi.common.resource.ISegments;
27 import net.sf.okapi.common.resource.ITextUnit;
28 import net.sf.okapi.common.resource.Segment;
29 import net.sf.okapi.common.resource.TextContainer;
30 import net.sf.okapi.common.resource.TextFragment;
31 import net.sf.okapi.common.resource.TextUnitUtil;
32
33 public class OkapiUtil {
34
35
36
37
38
39
40
41
42
43 public static float calculateNgramDiceCoefficient(String st1, String st2,
44 Analyzer analyzer) {
45
46 Set<String> st1Tokens = new HashSet<>();
47 Set<String> st2Tokens = new HashSet<>();
48
49 try (TokenStream ts1 = analyzer.tokenStream(null, new StringReader(st1))) {
50 ts1.reset();
51
52 while (ts1.incrementToken()) {
53 st1Tokens.add(ts1.getAttribute(CharTermAttribute.class).toString());
54 }
55
56 ts1.end();
57
58 } catch (IOException e) {
59 throw new OkapiException("Error tokenizing source TextUnits", e);
60 }
61
62 try (TokenStream ts2 = analyzer.tokenStream(null, new StringReader(st2))) {
63 ts2.reset();
64
65 while (ts2.incrementToken()) {
66 st2Tokens.add(ts2.getAttribute(CharTermAttribute.class).toString());
67 }
68
69 ts2.end();
70
71 } catch (IOException e) {
72 throw new OkapiException("Error tokenizing source TextUnits", e);
73 }
74
75
76 int st1Size = st1Tokens.size();
77 int st2Size = st2Tokens.size();
78 st1Tokens.retainAll(st2Tokens);
79 int intersection = st1Tokens.size();
80 return ((2.0f * intersection)) / (st1Size + st2Size) * 100.0f;
81 }
82
83 public static TextContainer safeGetTarget(ITextUnit tu, LocaleId trgLoc) {
84 try {
85 return (trgLoc != null) ? tu.getTarget(trgLoc) : null;
86
87 } catch (Exception e) {
88 return null;
89 }
90 }
91
92
93
94
95
96
97
98 public static void removeCodes(ITextUnit textUnit, boolean removeTargetCodes) {
99 Logger localLogger = LoggerFactory.getLogger(TextUnitUtil.class);
100 if (textUnit == null) {
101 localLogger.warn("Text unit is null.");
102 return;
103 }
104
105
106 TextContainer stc = textUnit.getSource();
107 removeCodes(stc);
108
109
110 if (removeTargetCodes && !textUnit.getTargetLocales().isEmpty()) {
111 for (LocaleId locale : textUnit.getTargetLocales()) {
112 TextContainer ttc = textUnit.getTarget(locale);
113 removeCodes(ttc);
114 }
115 }
116 }
117
118 public static void removeCodes(TextContainer tc) {
119 ISegments segs = tc.getSegments();
120
121 for (Segment seg : segs) {
122 TextUnitUtil.removeCodes(seg.text);
123 }
124 }
125
126
127
128
129
130
131
132
133
134 public static void removeExtraCodes(List<Code> srcCodes, TextFragment targetTf) {
135 if (srcCodes == null || targetTf == null || !targetTf.hasCode()) {
136 return;
137 }
138
139
140 List<Code> trgCodes = new ArrayList<>(targetTf.getCodes());
141
142
143 for (Code trgCode : trgCodes) {
144 boolean foundInSource = false;
145
146
147 for (Code srcCode : srcCodes) {
148 if (srcCode.getId() == trgCode.getId() &&
149 srcCode.getTagType() == trgCode.getTagType()) {
150 foundInSource = true;
151 break;
152 }
153 }
154
155
156 if (!foundInSource) {
157 targetTf.removeCode(trgCode);
158 }
159 }
160 }
161
162 public static void setAlOrigin(ITextUnit tu, LocaleId srcLoc, LocaleId trgLoc) {
163 TextContainer target = tu.getTarget(trgLoc);
164
165 if (target == null) {
166 return;
167 }
168
169 ISegments tsegs = target.getSegments();
170
171 for (Segment tseg : tsegs) {
172 AltTranslationsAnnotation ata = tseg.getAnnotation(AltTranslationsAnnotation.class);
173
174 if (ata == null) {
175 ata = new AltTranslationsAnnotation();
176 }
177
178
179 AltTranslation at = new AltTranslation(srcLoc, trgLoc, tu,
180 Const.AL_MATCH_TYPE, 100, null);
181
182 ata.add(at);
183 tseg.setAnnotation(ata);
184 }
185 }
186
187 public static String getMimeType(String fileName) {
188 return MimeTypeMapper.getMimeType(Files.getFileExtension(fileName));
189 }
190
191 public static void rearrangeCodes(List<Code> codes, TextFragment targetTf) {
192 if (codes == null || targetTf == null || !targetTf.hasCode()) {
193 return;
194 }
195
196
197
198
199 boolean needsRearrangement = true;
200 int maxIterations = 10;
201 int iteration = 0;
202
203 while (needsRearrangement && iteration < maxIterations) {
204 needsRearrangement = false;
205 iteration++;
206
207 String codedText = targetTf.getCodedText();
208
209
210 Map<Integer, Map<TextFragment.TagType, Integer>> positionMap = buildPositionMap(targetTf,
211 codedText);
212
213
214 List<Code> targetCodes = new ArrayList<>(targetTf.getCodes());
215 for (Code code : targetCodes) {
216 if (code.getTagType() == TextFragment.TagType.OPENING) {
217 int codeId = code.getId();
218
219
220 Map<TextFragment.TagType, Integer> positions = positionMap.get(codeId);
221 if (positions == null)
222 continue;
223
224 Integer openPos = positions.get(TextFragment.TagType.OPENING);
225 Integer closePos = positions.get(TextFragment.TagType.CLOSING);
226
227
228 if (openPos != null && closePos != null && closePos < openPos) {
229
230 swapMarkers(targetTf, codedText, openPos, closePos);
231 needsRearrangement = true;
232 break;
233 }
234 }
235 }
236 }
237
238
239
240 }
241
242
243
244
245 private static Map<Integer, Map<TextFragment.TagType, Integer>> buildPositionMap(
246 TextFragment tf, String codedText) {
247
248 Map<Integer, Map<TextFragment.TagType, Integer>> positionMap = new HashMap<>();
249
250 for (int i = 0; i < codedText.length(); i++) {
251 char ch = codedText.charAt(i);
252 if (TextFragment.isMarker(ch)) {
253 int codeIndex = TextFragment.toIndex(codedText.charAt(i + 1));
254 Code code = tf.getCode(codeIndex);
255 int codeId = code.getId();
256
257
258 Map<TextFragment.TagType, Integer> typeMap = positionMap.computeIfAbsent(
259 codeId, k -> new HashMap<>());
260
261
262 typeMap.put(code.getTagType(), i);
263
264 i++;
265 }
266 }
267
268 return positionMap;
269 }
270
271
272
273
274 private static void swapMarkers(TextFragment targetTf, String codedText,
275 int openPos, int closePos) {
276
277
278 int closeMarkerEnd = closePos + 2;
279 int openMarkerStart = openPos;
280 int openMarkerEnd = openPos + 2;
281
282 String beforeClose = codedText.substring(0, closePos);
283 String betweenCodes = codedText.substring(closeMarkerEnd, openMarkerStart);
284 String afterOpen = codedText.substring(openMarkerEnd);
285
286
287 char closeMarkerType = codedText.charAt(closePos);
288 char closeMarkerIndex = codedText.charAt(closePos + 1);
289 char openMarkerType = codedText.charAt(openPos);
290 char openMarkerIndex = codedText.charAt(openPos + 1);
291
292
293 StringBuilder newCodedText = new StringBuilder();
294 newCodedText.append(beforeClose);
295 newCodedText.append(openMarkerType).append(openMarkerIndex);
296 newCodedText.append(betweenCodes);
297 newCodedText.append(closeMarkerType).append(closeMarkerIndex);
298 newCodedText.append(afterOpen);
299
300
301
302 targetTf.setCodedText(newCodedText.toString(), targetTf.getCodes(), true);
303 }
304 }