1 package com.acumenvelocity.ath.solr.tm;
2
3 import java.util.ArrayList;
4 import java.util.Collections;
5 import java.util.HashMap;
6 import java.util.List;
7 import java.util.Map;
8
9 import org.apache.solr.client.solrj.response.QueryResponse;
10 import org.apache.solr.common.SolrDocument;
11 import org.apache.solr.common.SolrDocumentList;
12
13 import com.acumenvelocity.ath.common.Const;
14 import com.acumenvelocity.ath.common.ConversionUtil;
15 import com.acumenvelocity.ath.common.JacksonUtil;
16 import com.acumenvelocity.ath.common.Log;
17 import com.acumenvelocity.ath.common.OkapiUtil;
18 import com.acumenvelocity.ath.common.SolrUtil;
19 import com.acumenvelocity.ath.common.exception.AthRuntimeException;
20 import com.acumenvelocity.ath.model.x.LayeredTextX;
21 import com.acumenvelocity.ath.solr.AthIndex;
22 import com.acumenvelocity.ath.solr.BigramAnalyzer;
23 import com.acumenvelocity.ath.solr.Solr;
24
25 import net.sf.okapi.common.IParameters;
26 import net.sf.okapi.common.query.MatchType;
27 import net.sf.okapi.common.query.QueryResult;
28 import net.sf.okapi.common.resource.TextFragment;
29 import net.sf.okapi.lib.translation.BaseConnector;
30 import net.sf.okapi.lib.translation.ITMQuery;
31
32 public class SolrTmConnector extends BaseConnector implements ITMQuery {
33 private Parameters params;
34 private List<QueryResult> results;
35 private int maxHits = 3;
36 private int threshold = 75;
37 private int current = -1;
38 private BigramAnalyzer bigramAnalyzer = new BigramAnalyzer();
39
40
41 public static final String[] EXACT_RETURN_FIELDS = { Const.ATH_PROP_SOURCE_JSON,
42 Const.ATH_PROP_TARGET_JSON, Const.ATH_PROP_SOLR_ID, Const.ATH_PROP_TM_SEG_ID,
43 Const.ATH_PROP_SOLR_SCORE };
44
45 public static final String[] FUZZY_RETURN_FIELDS = { Const.ATH_PROP_SOURCE,
46 Const.ATH_PROP_SOURCE_JSON, Const.ATH_PROP_TARGET, Const.ATH_PROP_TARGET_JSON,
47 Const.ATH_PROP_SOLR_ID, Const.ATH_PROP_TM_SEG_ID, Const.ATH_PROP_SOLR_SCORE };
48
49 private static final int MAX_ROWS = 500;
50
51 public SolrTmConnector() {
52 super();
53 setParameters(new Parameters());
54 setWeight(1);
55 }
56
57 @Override
58 public String getName() {
59 return "Solr TM";
60 }
61
62 @Override
63 public String getSettingsDisplay() {
64 return Log.format(
65 "\nTM ID: {}\n"
66 + "Solr core name: {}\n"
67 + "Search in the source: {}\n"
68 + "Penalize TM matches with different tags in source: {}, in target: {}\n",
69
70 net.sf.okapi.common.Util.isEmpty(params.getTmId()) ? "<To be specified>"
71 : params.getTmId(),
72
73 net.sf.okapi.common.Util.isEmpty(params.getSolrCoreName()) ? "<To be specified>"
74 : params.getSolrCoreName(),
75
76 params.getSearchInSource() ? "Yes" : "No",
77 params.getPenalizeSourceTagsDifference() ? "Yes" : "No",
78 params.getPenalizeTargetTagsDifference() ? "Yes" : "No");
79 }
80
81 @Override
82 public boolean hasNext() {
83 if (results == null) {
84 return false;
85 }
86
87 if (current >= results.size()) {
88 current = -1;
89 }
90
91 return (current > -1);
92 }
93
94 @Override
95 public QueryResult next() {
96 if (results == null) {
97 return null;
98 }
99
100 if ((current > -1) && (current < results.size())) {
101 current++;
102 return results.get(current - 1);
103 }
104
105 current = -1;
106 return null;
107 }
108
109 @Override
110 public void open() {
111
112 }
113
114 @Override
115 public void close() {
116
117 }
118
119 @Override
120 public int query(String plainText) {
121 return query(new TextFragment(plainText));
122 }
123
124 @Override
125 public int query(TextFragment queryTf) {
126
127 try {
128
129 results = new ArrayList<>();
130 current = -1;
131
132 LayeredTextX queryLt = ConversionUtil.toLayeredText(queryTf, super.srcLoc);
133 String queryText = queryTf.getText();
134 Log.trace(this.getClass(), "--- query: {}", queryText);
135 String queryCodes = queryTf.getCodes().toString();
136
137 Map<String, Object> queryParams = new HashMap<>();
138 queryParams.put(Solr.INCLUDE_SCORE, true);
139
140
141 try {
142
143
144
145
146 queryParams.put(Solr.FIELDS, EXACT_RETURN_FIELDS);
147
148
149 queryParams.put(Solr.ROWS, 1);
150 String exactId = null;
151
152
153
154 QueryResponse response = AthIndex.getMany(params.getSolrCoreName(),
155 Log.format(
156 params.getSearchInSource()
157 ? "tmId:\"{}\" AND srcLang:\"{}\" AND trgLang:\"{}\" AND sourceWithCodes:\"{}\""
158 : "tmId:\"{}\" AND srcLang:\"{}\" AND trgLang:\"{}\" AND targetWithCodes:\"{}\"",
159 params.getTmId(),
160 super.srcCode,
161 super.trgCode,
162 SolrUtil.normalizeQuery(queryLt.getTextWithCodes())),
163 queryParams, QueryResponse.class);
164
165
166 SolrDocumentList docList = response.getResults();
167 if (docList.getNumFound() > 0) {
168
169 SolrDocument doc = docList.get(0);
170
171 String srcJson = doc._getStr(Const.ATH_PROP_SOURCE_JSON, null);
172 String trgJson = doc._getStr(Const.ATH_PROP_TARGET_JSON, null);
173
174 if (srcJson != null && trgJson != null) {
175 LayeredTextX srcSc = JacksonUtil.fromJson(srcJson, LayeredTextX.class);
176
177 LayeredTextX trgSc = JacksonUtil.fromJson(trgJson, LayeredTextX.class);
178
179 if (srcSc == null || trgSc == null) {
180 Log.warn(this.getClass(), "Broken Solr document, id='{}'", doc._getStr("id", null));
181
182 } else {
183 TextFragment tmSrcTf = ConversionUtil.toTextFragment(srcSc);
184 TextFragment tmTrgTf = ConversionUtil.toTextFragment(trgSc);
185
186 QueryResult qr = new QueryResult();
187 results.add(qr);
188
189 qr.weight = super.getWeight();
190 qr.setFuzzyScore(100);
191 qr.entryId = doc._getStr(Const.ATH_PROP_SOLR_ID, null);
192 qr.source = tmSrcTf;
193 qr.target = tmTrgTf;
194 qr.matchType = MatchType.EXACT;
195 qr.origin = getName();
196
197 exactId = qr.entryId;
198
199 Log.trace(this.getClass(), "\n--- {}%\n--seg: {}\n---tm: {}", qr.getFuzzyScore(),
200 queryLt.getText(),
201 tmSrcTf.getText());
202 }
203 }
204 }
205
206
207
208
209 if (maxHits > results.size()) {
210
211
212 queryParams.put(Solr.FIELDS, FUZZY_RETURN_FIELDS);
213
214
215
216
217
218
219
220
221 queryParams.put(Solr.ROWS, MAX_ROWS);
222
223
224 response = AthIndex.getMany(params.getSolrCoreName(),
225 Log.format(
226 params.getSearchInSource()
227 ? "tmId:\"{}\" AND srcLang:\"{}\" AND trgLang:\"{}\" AND source:\"{}\""
228 : "tmId:\"{}\" AND srcLang:\"{}\" AND trgLang:\"{}\" AND target:\"{}\"",
229 params.getTmId(),
230 super.srcCode,
231 super.trgCode,
232 SolrUtil.normalizeQuery(queryLt.getText())),
233 queryParams, QueryResponse.class);
234
235
236
237
238 docList = response.getResults();
239 if (docList.getNumFound() > 0) {
240 if (docList.getNumFound() > MAX_ROWS) {
241 Log.warn(getClass(), "Solr fuzzy query returned too many documents: {}",
242 docList.getNumFound());
243 }
244
245
246
247 for (SolrDocument doc : docList) {
248 String docId = doc._getStr(Const.ATH_PROP_SOLR_ID, null);
249
250
251 if (docId != null && docId.equalsIgnoreCase(exactId)) {
252 continue;
253 }
254
255 String tmText = (String) doc.getFieldValue(
256 params.getSearchInSource() ? Const.ATH_PROP_SOURCE : Const.ATH_PROP_TARGET);
257
258 if (!net.sf.okapi.common.Util.isEmpty(tmText)) {
259 int score = Math.round(OkapiUtil.calculateNgramDiceCoefficient(
260 tmText, queryText, bigramAnalyzer));
261
262
263
264
265
266
267
268
269 String srcJson = doc._getStr(Const.ATH_PROP_SOURCE_JSON, null);
270 String trgJson = doc._getStr(Const.ATH_PROP_TARGET_JSON, null);
271
272 if (srcJson != null && trgJson != null) {
273 LayeredTextX srcSc = JacksonUtil.fromJson(srcJson, LayeredTextX.class);
274
275 LayeredTextX trgSc = JacksonUtil.fromJson(trgJson, LayeredTextX.class);
276
277 if (srcSc == null || trgSc == null) {
278 Log.warn(this.getClass(), "Broken Solr document, id='{}'",
279 doc._getStr(Const.ATH_PROP_SOLR_ID, null));
280
281 } else {
282 TextFragment tmSrcTf = ConversionUtil.toTextFragment(srcSc);
283 TextFragment tmTrgTf = ConversionUtil.toTextFragment(trgSc);
284
285
286
287
288
289
290 if (params.getPenalizeSourceTagsDifference()) {
291 if (!queryCodes.equals(tmSrcTf.getCodes().toString())) {
292 score--;
293 }
294 }
295
296 if (params.getPenalizeTargetTagsDifference()) {
297 if (!queryCodes.equals(tmTrgTf.getCodes().toString())) {
298 score--;
299 }
300 }
301
302 if (score < threshold) {
303 Log.trace(this.getClass(), "\n--- {}%\n--seg: {}\n---tm: {}", score,
304 queryLt.getText(),
305 tmSrcTf.getText());
306
307
308 continue;
309 }
310
311 QueryResult qr = new QueryResult();
312 results.add(qr);
313
314 qr.weight = super.getWeight();
315
316 if (SolrUtil.checkTmFuzzyScore(score)) {
317 qr.setFuzzyScore(score);
318 }
319
320 qr.entryId = doc._getStr(Const.ATH_PROP_SOLR_ID, null);
321 qr.source = tmSrcTf;
322 qr.target = tmTrgTf;
323 qr.matchType = score >= 100 ? MatchType.EXACT : MatchType.FUZZY;
324 qr.origin = getName();
325
326 Log.trace(this.getClass(), "\n--- {}%\n--seg: {}\n---tm: {}",
327 qr.getFuzzyScore(),
328 queryLt.getText(),
329 tmSrcTf.getText());
330 }
331 }
332 }
333 }
334 }
335
336
337 Collections.sort(results);
338 }
339
340
341
342
343 } catch (Exception e) {
344 AthRuntimeException.logAndThrow(SolrTmConnector.class, e);
345 }
346
347 if (results.size() > 0) {
348 current = 0;
349 }
350
351 return results.size();
352
353 } finally {
354
355 }
356 }
357
358 @Override
359 public void setMaximumHits(int maxHits) {
360 if (maxHits < 1) {
361 this.maxHits = 1;
362
363 } else {
364 this.maxHits = maxHits;
365 }
366 }
367
368 @Override
369 public int getMaximumHits() {
370 return maxHits;
371 }
372
373 @Override
374 public int getThreshold() {
375 return threshold;
376 }
377
378 @Override
379 public void setThreshold(int threshold) {
380 this.threshold = threshold;
381 }
382
383 @Override
384 public IParameters getParameters() {
385 return params;
386 }
387
388 @Override
389 public void setParameters(IParameters params) {
390 this.params = (Parameters) params;
391 }
392 }