View Javadoc
1   package com.acumenvelocity.ath.solr.tm;
2   
3   import java.util.ArrayList;
4   import java.util.Collections;
5   import java.util.HashMap;
6   import java.util.List;
7   import java.util.Map;
8   
9   import org.apache.solr.client.solrj.response.QueryResponse;
10  import org.apache.solr.common.SolrDocument;
11  import org.apache.solr.common.SolrDocumentList;
12  
13  import com.acumenvelocity.ath.common.Const;
14  import com.acumenvelocity.ath.common.ConversionUtil;
15  import com.acumenvelocity.ath.common.JacksonUtil;
16  import com.acumenvelocity.ath.common.Log;
17  import com.acumenvelocity.ath.common.OkapiUtil;
18  import com.acumenvelocity.ath.common.SolrUtil;
19  import com.acumenvelocity.ath.common.exception.AthRuntimeException;
20  import com.acumenvelocity.ath.model.x.LayeredTextX;
21  import com.acumenvelocity.ath.solr.AthIndex;
22  import com.acumenvelocity.ath.solr.BigramAnalyzer;
23  import com.acumenvelocity.ath.solr.Solr;
24  
25  import net.sf.okapi.common.IParameters;
26  import net.sf.okapi.common.query.MatchType;
27  import net.sf.okapi.common.query.QueryResult;
28  import net.sf.okapi.common.resource.TextFragment;
29  import net.sf.okapi.lib.translation.BaseConnector;
30  import net.sf.okapi.lib.translation.ITMQuery;
31  
32  public class SolrTmConnector extends BaseConnector implements ITMQuery {
33    private Parameters params;
34    private List<QueryResult> results;
35    private int maxHits = 3;
36    private int threshold = 75;
37    private int current = -1;
38    private BigramAnalyzer bigramAnalyzer = new BigramAnalyzer();
39  
40    // The fields to be returned
41    public static final String[] EXACT_RETURN_FIELDS = { Const.ATH_PROP_SOURCE_JSON,
42        Const.ATH_PROP_TARGET_JSON, Const.ATH_PROP_SOLR_ID, Const.ATH_PROP_TM_SEG_ID,
43        Const.ATH_PROP_SOLR_SCORE };
44  
45    public static final String[] FUZZY_RETURN_FIELDS = { Const.ATH_PROP_SOURCE,
46        Const.ATH_PROP_SOURCE_JSON, Const.ATH_PROP_TARGET, Const.ATH_PROP_TARGET_JSON,
47        Const.ATH_PROP_SOLR_ID, Const.ATH_PROP_TM_SEG_ID, Const.ATH_PROP_SOLR_SCORE };
48  
49    private static final int MAX_ROWS = 500;
50  
51    public SolrTmConnector() {
52      super();
53      setParameters(new Parameters());
54      setWeight(1);
55    }
56  
57    @Override
58    public String getName() {
59      return "Solr TM";
60    }
61  
62    @Override
63    public String getSettingsDisplay() {
64      return Log.format(
65          "\nTM ID: {}\n"
66              + "Solr core name: {}\n"
67              + "Search in the source: {}\n"
68              + "Penalize TM matches with different tags in source: {}, in target: {}\n",
69  
70          net.sf.okapi.common.Util.isEmpty(params.getTmId()) ? "<To be specified>"
71              : params.getTmId(),
72  
73          net.sf.okapi.common.Util.isEmpty(params.getSolrCoreName()) ? "<To be specified>"
74              : params.getSolrCoreName(),
75  
76          params.getSearchInSource() ? "Yes" : "No",
77          params.getPenalizeSourceTagsDifference() ? "Yes" : "No",
78          params.getPenalizeTargetTagsDifference() ? "Yes" : "No");
79    }
80  
81    @Override
82    public boolean hasNext() {
83      if (results == null) {
84        return false;
85      }
86  
87      if (current >= results.size()) {
88        current = -1;
89      }
90  
91      return (current > -1);
92    }
93  
94    @Override
95    public QueryResult next() {
96      if (results == null) {
97        return null;
98      }
99  
100     if ((current > -1) && (current < results.size())) {
101       current++;
102       return results.get(current - 1);
103     }
104 
105     current = -1;
106     return null;
107   }
108 
109   @Override
110   public void open() {
111     // Nothing
112   }
113 
114   @Override
115   public void close() {
116     // Nothing
117   }
118 
119   @Override
120   public int query(String plainText) {
121     return query(new TextFragment(plainText));
122   }
123 
124   @Override
125   public int query(TextFragment queryTf) {
126     // P.s(Const.PROFILER_ZONE_TM_CONNECTOR, "Solr connector");
127     try {
128       // P.s(15, "Prepare search");
129       results = new ArrayList<>();
130       current = -1;
131 
132       LayeredTextX queryLt = ConversionUtil.toLayeredText(queryTf, super.srcLoc);
133       String queryText = queryTf.getText();
134       Log.trace(this.getClass(), "--- query: {}", queryText);
135       String queryCodes = queryTf.getCodes().toString();
136 
137       Map<String, Object> queryParams = new HashMap<>();
138       queryParams.put(Solr.INCLUDE_SCORE, true);
139       // P.e(15);
140 
141       try {
142         // P.s(17, "Both exact & fuzzy");
143         // P.s(Const.PROFILER_ZONE_TM_SOLR_EXACT, "Solr exact");
144 
145         // Query Solr for an exact match of the text with tags
146         queryParams.put(Solr.FIELDS, EXACT_RETURN_FIELDS);
147 
148         // XXX We need only one exact match, only single target TMs are supported
149         queryParams.put(Solr.ROWS, 1);
150         String exactId = null;
151 
152         // Ignore those with hidden=true (for MT cache use only)
153         // P.s(18, "Solr exact query");
154         QueryResponse response = AthIndex.getMany(params.getSolrCoreName(),
155             Log.format(
156                 params.getSearchInSource()
157                     ? "tmId:\"{}\" AND srcLang:\"{}\" AND trgLang:\"{}\" AND sourceWithCodes:\"{}\""
158                     : "tmId:\"{}\" AND srcLang:\"{}\" AND trgLang:\"{}\" AND targetWithCodes:\"{}\"",
159                 params.getTmId(),
160                 super.srcCode,
161                 super.trgCode,
162                 SolrUtil.normalizeQuery(queryLt.getTextWithCodes())),
163             queryParams, QueryResponse.class);
164         // P.e(18);
165 
166         SolrDocumentList docList = response.getResults();
167         if (docList.getNumFound() > 0) {
168           // Exact match
169           SolrDocument doc = docList.get(0);
170 
171           String srcJson = doc._getStr(Const.ATH_PROP_SOURCE_JSON, null);
172           String trgJson = doc._getStr(Const.ATH_PROP_TARGET_JSON, null);
173 
174           if (srcJson != null && trgJson != null) {
175             LayeredTextX srcSc = JacksonUtil.fromJson(srcJson, LayeredTextX.class);
176 
177             LayeredTextX trgSc = JacksonUtil.fromJson(trgJson, LayeredTextX.class);
178 
179             if (srcSc == null || trgSc == null) {
180               Log.warn(this.getClass(), "Broken Solr document, id='{}'", doc._getStr("id", null));
181 
182             } else {
183               TextFragment tmSrcTf = ConversionUtil.toTextFragment(srcSc);
184               TextFragment tmTrgTf = ConversionUtil.toTextFragment(trgSc);
185 
186               QueryResult qr = new QueryResult();
187               results.add(qr);
188 
189               qr.weight = super.getWeight();
190               qr.setFuzzyScore(100);
191               qr.entryId = doc._getStr(Const.ATH_PROP_SOLR_ID, null);
192               qr.source = tmSrcTf;
193               qr.target = tmTrgTf;
194               qr.matchType = MatchType.EXACT;
195               qr.origin = getName();
196 
197               exactId = qr.entryId;
198 
199               Log.trace(this.getClass(), "\n--- {}%\n--seg: {}\n---tm: {}", qr.getFuzzyScore(),
200                   queryLt.getText(),
201                   tmSrcTf.getText());
202             }
203           }
204         }
205 
206         // P.e(Const.PROFILER_ZONE_TM_SOLR_EXACT);
207         // P.s(Const.PROFILER_ZONE_TM_SOLR_FUZZY, "Solr fuzzy");
208 
209         if (maxHits > results.size()) {
210           // Query Solr fuzzily for the text without tags
211           // XXX We return only the highest score fuzzy match
212           queryParams.put(Solr.FIELDS, FUZZY_RETURN_FIELDS);
213 
214           // // +1 for a duplicate exact match among fuzzy ones
215           // int numFuzzies = maxHits - results.size();
216           // if (results.size() > 0) {
217           // numFuzzies++;
218           // }
219 
220           // queryParams.put(Solr.ROWS, numFuzzies);
221           queryParams.put(Solr.ROWS, MAX_ROWS);
222           // Ignore those with hidden=true (for MT cache use only)
223           // P.s(19, "Solr fuzzy query");
224           response = AthIndex.getMany(params.getSolrCoreName(),
225               Log.format(
226                   params.getSearchInSource()
227                       ? "tmId:\"{}\" AND srcLang:\"{}\" AND trgLang:\"{}\" AND source:\"{}\""
228                       : "tmId:\"{}\" AND srcLang:\"{}\" AND trgLang:\"{}\" AND target:\"{}\"",
229                   params.getTmId(),
230                   super.srcCode,
231                   super.trgCode,
232                   SolrUtil.normalizeQuery(queryLt.getText())),
233               queryParams, QueryResponse.class);
234           // P.e(19);
235 
236           // P.c(16, "QTime", response.getQTime(), queryText);
237 
238           docList = response.getResults();
239           if (docList.getNumFound() > 0) {
240             if (docList.getNumFound() > MAX_ROWS) {
241               Log.warn(getClass(), "Solr fuzzy query returned too many documents: {}",
242                   docList.getNumFound());
243             }
244             // int maxScore = 0;
245             // SolrDocument maxScoreDoc = null;
246 
247             for (SolrDocument doc : docList) {
248               String docId = doc._getStr(Const.ATH_PROP_SOLR_ID, null);
249 
250               // Don't add to results what has been matched exactly before
251               if (docId != null && docId.equalsIgnoreCase(exactId)) {
252                 continue;
253               }
254 
255               String tmText = (String) doc.getFieldValue(
256                   params.getSearchInSource() ? Const.ATH_PROP_SOURCE : Const.ATH_PROP_TARGET);
257 
258               if (!net.sf.okapi.common.Util.isEmpty(tmText)) {
259                 int score = Math.round(OkapiUtil.calculateNgramDiceCoefficient(
260                     tmText, queryText, bigramAnalyzer));
261 
262                 // // XXX If there are several matches with the same score, the first one is
263                 // kept
264                 // if (maxScore < score) {
265                 // maxScore = score;
266                 // maxScoreDoc = doc;
267                 // }
268 
269                 String srcJson = doc._getStr(Const.ATH_PROP_SOURCE_JSON, null);
270                 String trgJson = doc._getStr(Const.ATH_PROP_TARGET_JSON, null);
271 
272                 if (srcJson != null && trgJson != null) {
273                   LayeredTextX srcSc = JacksonUtil.fromJson(srcJson, LayeredTextX.class);
274 
275                   LayeredTextX trgSc = JacksonUtil.fromJson(trgJson, LayeredTextX.class);
276 
277                   if (srcSc == null || trgSc == null) {
278                     Log.warn(this.getClass(), "Broken Solr document, id='{}'",
279                         doc._getStr(Const.ATH_PROP_SOLR_ID, null));
280 
281                   } else {
282                     TextFragment tmSrcTf = ConversionUtil.toTextFragment(srcSc);
283                     TextFragment tmTrgTf = ConversionUtil.toTextFragment(trgSc);
284 
285                     // Apply penalties
286                     // XXX If another match has equal codes with the query, but a lower text
287                     // matching score, it is still a lower match than the one with the text closer
288                     // to the query, but probably worse codes. We give a higher priority to the
289                     // text, not the codes.
290                     if (params.getPenalizeSourceTagsDifference()) {
291                       if (!queryCodes.equals(tmSrcTf.getCodes().toString())) {
292                         score--; // 1% penalty
293                       }
294                     }
295 
296                     if (params.getPenalizeTargetTagsDifference()) {
297                       if (!queryCodes.equals(tmTrgTf.getCodes().toString())) {
298                         score--; // 1% penalty
299                       }
300                     }
301 
302                     if (score < threshold) {
303                       Log.trace(this.getClass(), "\n--- {}%\n--seg: {}\n---tm: {}", score,
304                           queryLt.getText(),
305                           tmSrcTf.getText());
306 
307                       // Drop the hit
308                       continue;
309                     }
310 
311                     QueryResult qr = new QueryResult();
312                     results.add(qr);
313 
314                     qr.weight = super.getWeight();
315 
316                     if (SolrUtil.checkTmFuzzyScore(score)) {
317                       qr.setFuzzyScore(score);
318                     }
319 
320                     qr.entryId = doc._getStr(Const.ATH_PROP_SOLR_ID, null);
321                     qr.source = tmSrcTf;
322                     qr.target = tmTrgTf;
323                     qr.matchType = score >= 100 ? MatchType.EXACT : MatchType.FUZZY;
324                     qr.origin = getName();
325 
326                     Log.trace(this.getClass(), "\n--- {}%\n--seg: {}\n---tm: {}",
327                         qr.getFuzzyScore(),
328                         queryLt.getText(),
329                         tmSrcTf.getText());
330                   }
331                 }
332               }
333             }
334           }
335 
336           // Sort QueryResults on MatchType, Score, and Source String
337           Collections.sort(results);
338         }
339 
340         // P.e(Const.PROFILER_ZONE_TM_SOLR_FUZZY);
341         // P.e(17);
342 
343       } catch (Exception e) {
344         AthRuntimeException.logAndThrow(SolrTmConnector.class, e);
345       }
346 
347       if (results.size() > 0) {
348         current = 0;
349       }
350 
351       return results.size();
352 
353     } finally {
354       // P.e(Const.PROFILER_ZONE_TM_CONNECTOR);
355     }
356   }
357 
358   @Override
359   public void setMaximumHits(int maxHits) {
360     if (maxHits < 1) {
361       this.maxHits = 1;
362 
363     } else {
364       this.maxHits = maxHits;
365     }
366   }
367 
368   @Override
369   public int getMaximumHits() {
370     return maxHits;
371   }
372 
373   @Override
374   public int getThreshold() {
375     return threshold;
376   }
377 
378   @Override
379   public void setThreshold(int threshold) {
380     this.threshold = threshold;
381   }
382 
383   @Override
384   public IParameters getParameters() {
385     return params;
386   }
387 
388   @Override
389   public void setParameters(IParameters params) {
390     this.params = (Parameters) params;
391   }
392 }