View Javadoc
1   package com.acumenvelocity.ath.common;
2   
3   import java.util.ArrayList;
4   import java.util.HashMap;
5   import java.util.List;
6   import java.util.Map;
7   import java.util.UUID;
8   
9   import org.apache.solr.client.solrj.SolrClient;
10  import org.apache.solr.client.solrj.SolrQuery;
11  import org.apache.solr.client.solrj.response.QueryResponse;
12  import org.apache.solr.common.SolrDocument;
13  import org.apache.solr.common.SolrDocumentList;
14  import org.apache.solr.common.SolrInputDocument;
15  
16  import com.acumenvelocity.ath.common.exception.AthException;
17  import com.acumenvelocity.ath.solr.AthIndex;
18  import com.acumenvelocity.ath.solr.Solr;
19  
20  import net.sf.okapi.common.Base64Util;
21  import net.sf.okapi.common.Util;
22  import net.sf.okapi.common.resource.ITextUnit;
23  
24  public class SolrUtil {
25  
26    private static final int MAX_QUERY_LENGTH = Integer.MAX_VALUE;
27  
28    /**
29     * Converts a SolrDocument to a SolrInputDocument for reindexing or updates.
30     * 
31     * <p>
32     * This method creates a new SolrInputDocument by copying all user-defined fields from the
33     * source SolrDocument. Internal Solr fields (those starting with underscore) are automatically
34     * excluded to allow Solr to manage its own metadata fields like {@code _version_},
35     * {@code _root_},
36     * and {@code _nest_path_}.
37     * 
38     * <p>
39     * This is commonly used when:
40     * <ul>
41     * <li>Reading documents from Solr, modifying them, and writing them back</li>
42     * <li>Copying documents between collections</li>
43     * <li>Performing batch updates on existing documents</li>
44     * </ul>
45     * 
46     * <p>
47     * <strong>Example Usage:</strong>
48     * 
49     * <pre>
50     * SolrDocument doc = // ... retrieved from Solr query
51     * SolrInputDocument inputDoc = toInputDocument(doc);
52     * inputDoc.setField("status", "updated");
53     * solr.add(inputDoc);
54     * </pre>
55     * 
56     * <p>
57     * <strong>Note:</strong> This method uses {@code setField()} rather than {@code addField()}
58     * to prevent duplicate field values, ensuring each field appears only once in the resulting
59     * document.
60     * 
61     * @param solrDoc the source SolrDocument to convert; must not be null
62     * @return a new SolrInputDocument containing all non-internal fields from the source document
63     * @throws NullPointerException if solrDoc is null
64     */
65    public static SolrInputDocument toInputDocument(SolrDocument solrDoc) {
66      SolrInputDocument inputDoc = new SolrInputDocument();
67  
68      for (String fieldName : solrDoc.getFieldNames()) {
69        // Skip internal Solr fields
70        if (fieldName.startsWith("_")) {
71          continue;
72        }
73        
74        // Use setField instead of addField to prevent duplicates
75        inputDoc.setField(fieldName, solrDoc.getFieldValue(fieldName));
76      }
77  
78      return inputDoc;
79    }
80  
81    /**
82     * Fetches a SolrInputDocument by its unique ID.
83     *
84     * @param solrClient The SolrClient instance.
85     * @param coreName   The Solr core or collection name.
86     * @param id         The unique document ID.
87     * @return The SolrInputDocument, or null if not found.
88     * @throws Exception if Solr query fails.
89     */
90    public static SolrInputDocument getDocumentBySolrId(SolrClient solrClient, String coreName,
91        String id) throws Exception {
92  
93      SolrQuery query = new SolrQuery("id:\"" + id + "\"");
94      query.setRows(1);
95  
96      QueryResponse response = solrClient.query(coreName, query);
97      SolrDocumentList docs = response.getResults();
98  
99      if (docs == null || docs.isEmpty()) {
100       return null;
101     }
102 
103     SolrDocument solrDoc = docs.get(0);
104     SolrInputDocument inputDoc = new SolrInputDocument();
105 
106     for (String fieldName : solrDoc.getFieldNames()) {
107       // Skip internal Solr fields
108       if (fieldName.startsWith("_")) {
109         continue;
110       }
111       // Use setField instead of addField to prevent duplicates
112       inputDoc.setField(fieldName, solrDoc.getFieldValue(fieldName));
113     }
114 
115     return inputDoc;
116   }
117 
118   public static SolrDocument getDocumentByDocId(UUID docId) {
119     String query = Log.format("docId:\"{}\"", docId);
120 
121     try {
122       QueryResponse response = AthIndex.getMany(Const.SOLR_CORE_ATH_DOCS, query, null,
123           QueryResponse.class);
124 
125       if (response.getResults().isEmpty()) {
126         return null;
127       }
128 
129       return response.getResults().get(0);
130 
131     } catch (Exception e) {
132       // Do nothing
133     }
134 
135     return null;
136   }
137 
138   public static SolrDocument getDocumentSegment(UUID docId, UUID docSegId) {
139     String query = Log.format("docId:\"{}\" AND docSegId:\"{}\"", docId, docSegId);
140 
141     try {
142       QueryResponse response = AthIndex.getMany(Const.SOLR_CORE_ATH_DOC_SEGMENTS, query, null,
143           QueryResponse.class);
144 
145       SolrDocumentList docs = response.getResults();
146 
147       if (docs == null || docs.isEmpty()) {
148         return null;
149       }
150 
151       return docs.get(0);
152 
153     } catch (Exception e) {
154       Log.warn(SolrUtil.class, "Error finding a document segment: " + e.getMessage());
155     }
156 
157     return null;
158   }
159   
160   public static SolrDocument getTmByTmId(UUID tmId) {
161     String query = Log.format("tmId:\"{}\"", tmId);
162 
163     try {
164       QueryResponse response = AthIndex.getMany(Const.SOLR_CORE_ATH_TMS, query, null,
165           QueryResponse.class);
166 
167       if (response.getResults().isEmpty()) {
168         return null;
169       }
170 
171       return response.getResults().get(0);
172 
173     } catch (Exception e) {
174       // Do nothing
175     }
176 
177     return null;
178   }
179 
180   /**
181    * Shifts document segments at or below a specified position down by one position.
182    * 
183    * <p>
184    * This method is used to make room for inserting a new document segment at a specific position
185    * by incrementing the position of all existing segments at or after that position. The method
186    * handles both normal position updates and edge cases where position data may be missing or
187    * invalid.
188    * 
189    * <p>
190    * <strong>Normal Operation:</strong>
191    * <ul>
192    * <li>Queries Solr for all document segments matching the given docId with position >= the
193    * specified position</li>
194    * <li>Increments each matching document's position by 1</li>
195    * <li>Recalculates the Solr ID for each document based on the new position using
196    * {@link SolrUtil#buildDocSegSolrId(UUID, Long)}</li>
197    * <li>Writes all modified documents back to Solr in a batch operation, overwriting the old
198    * documents</li>
199    * </ul>
200    * 
201    * <p>
202    * <strong>Edge Case Handling - Missing Position Data:</strong>
203    * <br>
204    * When a document's position field is missing, null, or invalid (defaults to 0), incrementing by
205    * 1
206    * results in a new position of 1. This creates a potential conflict with existing documents
207    * already at
208    * position 1 or above. To handle this scenario:
209    * <ul>
210    * <li>The method detects when any documents will be moved to position 1</li>
211    * <li>Executes an additional query to find all existing documents at position >= 1 that weren't
212    * already included in the initial query</li>
213    * <li>Shifts these additional documents down by incrementing their positions as well</li>
214    * <li>Ensures no position conflicts occur when documents with failed position reads are inserted
215    * at the head</li>
216    * </ul>
217    * 
218    * <p>
219    * <strong>Example Usage:</strong>
220    * 
221    * <pre>
222    * // Inserting a new segment at position 3 - all segments at positions 3, 4, 5, etc.
223    * // will be moved to positions 4, 5, 6, etc.
224    * moveDocSegmentsBelow(docId, 3L);
225    * </pre>
226    * 
227    * <p>
228    * <strong>Thread Safety and Distributed Locking:</strong> This method is thread-safe across
229    * multiple
230    * worker nodes in a distributed system. It uses Solr-based distributed locking to ensure that
231    * only one
232    * worker node can modify segments for a given docId at a time. The method will retry up to 3
233    * times with
234    * exponential backoff if the lock is already held by another node. Concurrent operations on
235    * different
236    * documents can proceed in parallel without blocking each other.
237    * 
238    * <p>
239    * <strong>Error Handling:</strong> All exceptions are silently caught and ignored. The method
240    * follows a fail-silent pattern and will not throw exceptions to the caller. Locks are always
241    * released
242    * in a finally block to prevent deadlocks.
243    * 
244    * <p>
245    * <strong>Internal Fields:</strong> The Solr internal field {@code _version_} is explicitly
246    * excluded from document copying to allow Solr to manage its own versioning for optimistic
247    * concurrency control.
248    * 
249    * @param docId    the UUID of the document whose segments should be shifted; must not be null
250    * @param position the position threshold; all segments at this position or higher will be shifted
251    *                 down
252    *                 (position incremented by 1); must not be null
253    * 
254    * @see SolrUtil#buildDocSegSolrId(UUID, Long)
255    * @see AthIndex#getMany(String, String, Map, Class)
256    * @see AthIndex#createMany(String, List)
257    */
258   public static void moveDocSegmentsBelow(UUID docId, Long position) {
259     int maxRetries = 3;
260     int retryCount = 0;
261 
262     while (retryCount < maxRetries) {
263       try {
264         // Try to acquire lock on the parent document
265         if (!acquireLock(docId)) {
266           // Lock already held by another node, wait and retry
267           Thread.sleep(100 * (retryCount + 1)); // exponential backoff
268           retryCount++;
269           continue;
270         }
271 
272         try {
273           // Perform the actual repositioning work
274           performRepositioning(docId, position);
275 
276         } finally {
277           // Always release the lock
278           releaseLock(docId);
279         }
280 
281         // Success, exit retry loop
282         break;
283 
284       } catch (Exception e) {
285         // Do nothing
286         break;
287       }
288     }
289   }
290 
291   /**
292    * Attempts to acquire a distributed lock on a document using Solr's atomic update capabilities.
293    * 
294    * <p>
295    * This method uses optimistic locking to ensure only one worker node can hold the lock at a time
296    * across a distributed system. The lock is implemented as a timestamp field that records when the
297    * lock was acquired.
298    * 
299    * <p>
300    * The method attempts to atomically update the document's lock field only if it's currently null
301    * or expired (older than a timeout threshold). This prevents race conditions where multiple nodes
302    * try to acquire the lock simultaneously.
303    * 
304    * @param docId the UUID of the document to lock; must not be null
305    * @return true if the lock was successfully acquired, false if another node holds the lock
306    * @throws Exception if the Solr operation fails
307    */
308   private static boolean acquireLock(UUID docId) throws Exception {
309     long currentTime = System.currentTimeMillis();
310     long expiredBefore = currentTime - Const.SOLR_DOC_LOCK_TIMEOUT_MS;
311 
312     // Query to check current lock status
313     String query = Log.format("id:\"{}\"", docId);
314     QueryResponse response = AthIndex.getMany(Const.SOLR_CORE_ATH_DOCS, query, null,
315         QueryResponse.class);
316 
317     if (response.getResults().isEmpty()) {
318       return false; // Document doesn't exist
319     }
320 
321     SolrDocument doc = response.getResults().get(0);
322     Long lockTimestamp = safeGetLongField(doc, "lockTimestamp", null);
323     Long version = (Long) doc.getFieldValue("_version_");
324 
325     // Check if lock is available (null or expired)
326     if (lockTimestamp != null && lockTimestamp > expiredBefore) {
327       return false; // Lock is held by another node
328     }
329 
330     // Try to acquire the lock using atomic update with version check
331     Map<String, Object> updateDoc = new HashMap<>();
332     updateDoc.put("id", docId.toString());
333     updateDoc.put("lockTimestamp", Map.of("set", currentTime));
334     updateDoc.put("_version_", version); // Optimistic locking
335 
336     try {
337       AthIndex.createOne(Const.SOLR_CORE_ATH_DOCS, updateDoc);
338       return true; // Lock acquired successfully
339     } catch (Exception e) {
340       // Version conflict - another node acquired the lock first
341       return false;
342     }
343   }
344 
345   /**
346    * Releases a distributed lock on a document by clearing the lock timestamp field.
347    * 
348    * <p>
349    * This method should always be called in a finally block to ensure locks are released
350    * even if an exception occurs during processing.
351    * 
352    * @param docId the UUID of the document to unlock; must not be null
353    * @throws Exception if the Solr operation fails
354    */
355   private static void releaseLock(UUID docId) throws Exception {
356     Map<String, Object> updateDoc = new HashMap<>();
357     updateDoc.put("id", docId.toString());
358     updateDoc.put("lockTimestamp", Map.of("set", null)); // Clear the lock
359 
360     AthIndex.createOne(Const.SOLR_CORE_ATH_DOCS, updateDoc);
361   }
362 
363   /**
364    * Performs the actual repositioning of document segments.
365    * 
366    * <p>
367    * This is the core logic extracted from the original method, to be executed
368    * while holding the distributed lock.
369    * 
370    * @param docId    the UUID of the document whose segments should be shifted
371    * @param position the position threshold for shifting segments
372    * @throws Exception if any Solr operation fails
373    */
374   private static void performRepositioning(UUID docId, Long position) throws Exception {
375     // Query for documents with docId matching AND position >= the given position
376     String query = Log.format("docId:\"{}\" AND position:[{} TO *]", docId, position);
377 
378     QueryResponse response = AthIndex.getMany(Const.SOLR_CORE_ATH_DOC_SEGMENTS, query, null,
379         QueryResponse.class);
380 
381     if (response.getResults().isEmpty()) {
382       return;
383     }
384 
385     // Prepare list to hold modified documents
386     List<Map<String, Object>> updatedDocs = new ArrayList<>();
387 
388     // Track if we have any documents that will be moved to position 1
389     boolean hasDocsMovingToPositionOne = false;
390 
391     // Process each document
392     for (SolrDocument solrDoc : response.getResults()) {
393       // Create a map for the updated document
394       Map<String, Object> doc = new HashMap<>();
395 
396       // Copy all existing fields (except _version_)
397       for (String fieldName : solrDoc.getFieldNames()) {
398         if (!"_version_".equals(fieldName)) {
399           doc.put(fieldName, solrDoc.getFieldValue(fieldName));
400         }
401       }
402 
403       // Get current position and increment by 1
404       Long currentPosition = safeGetLongField(solrDoc, "position", 0L);
405       Long newPosition = currentPosition + 1; // docs with no position are inserted at the head
406 
407       if (newPosition == 1L) {
408         hasDocsMovingToPositionOne = true;
409       }
410 
411       // Update position field
412       doc.put("position", newPosition);
413 
414       // Update the Solr ID based on new position
415       String newId = SolrUtil.buildDocSegSolrId(docId, newPosition);
416       doc.put(Const.ATH_PROP_SOLR_ID, newId);
417 
418       updatedDocs.add(doc);
419     }
420 
421     // If documents are being moved to position 1, we need to shift existing docs at position 1 and
422     // above
423     if (hasDocsMovingToPositionOne) {
424       // Query for all documents at position >= 1 (which weren't already included)
425       String shiftQuery = Log.format("docId:\"{}\" AND position:[1 TO *]", docId);
426       QueryResponse shiftResponse = AthIndex.getMany(Const.SOLR_CORE_ATH_DOC_SEGMENTS, shiftQuery,
427           null,
428           QueryResponse.class);
429 
430       for (SolrDocument solrDoc : shiftResponse.getResults()) {
431         Long existingPosition = safeGetLongField(solrDoc, "position", 0L);
432 
433         // Skip if this document was already processed in the first query
434         if (existingPosition >= position) {
435           continue;
436         }
437 
438         // Create a map for the updated document
439         Map<String, Object> doc = new HashMap<>();
440 
441         // Copy all existing fields (except _version_)
442         for (String fieldName : solrDoc.getFieldNames()) {
443           if (!"_version_".equals(fieldName)) {
444             doc.put(fieldName, solrDoc.getFieldValue(fieldName));
445           }
446         }
447 
448         // Increment position by 1
449         Long newPosition = existingPosition + 1;
450         doc.put("position", newPosition);
451 
452         // Update the Solr ID based on new position
453         String newId = SolrUtil.buildDocSegSolrId(docId, newPosition);
454         doc.put(Const.ATH_PROP_SOLR_ID, newId);
455 
456         updatedDocs.add(doc);
457       }
458     }
459 
460     // Write all modified documents back to Solr
461     if (!updatedDocs.isEmpty()) {
462       AthIndex.createMany(Const.SOLR_CORE_ATH_DOC_SEGMENTS, updatedDocs);
463     }
464   }
465 
466   public static boolean checkTmFuzzyScore(int score) {
467     return score >= 0 && score <= 101;
468   }
469 
470   /**
471    * @see org.apache.solr.client.solrj.util.ClientUtils
472    * @see https://stackoverflow.com/questions/44708872/why-does-solr-clientutilsescapequerychars-escape-spaces
473    * @return
474    */
475   public static String escapeQueryCharsNoWs(String s) {
476     StringBuilder sb = new StringBuilder();
477     for (int i = 0; i < s.length(); i++) {
478       char c = s.charAt(i);
479       // These characters are part of the query syntax and must be escaped
480       if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')'
481           || c == ':' || c == '^'
482           || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}'
483           || c == '~' || c == '*' || c == '?'
484           || c == '|' || c == '&' || c == ';' || c == '/') {
485         sb.append('\\');
486       }
487       sb.append(c);
488     }
489     return sb.toString();
490   }
491 
492   public static String normalizeQuery(String query) {
493     return normalizeQuery(query, MAX_QUERY_LENGTH);
494   }
495 
496   public static String normalizeQuery(String query, int maxLen) {
497     if (Util.isEmpty(query)) {
498       return query;
499     }
500 
501     // String st = ClientUtils.escapeQueryChars(query);
502     String st = escapeQueryCharsNoWs(query);
503 
504     if (maxLen < 1) {
505       maxLen = 1;
506     }
507 
508     if (st.length() > maxLen) {
509       st = st.substring(0, st.charAt(maxLen - 1) == '\\' ? maxLen - 1 : maxLen);
510       Log.warn(Solr.class,
511           "Query string is longer than {} chars, truncated to '{}'", maxLen,
512           st);
513     }
514 
515     return st;
516   }
517 
518   public static long getNumDocuments() {
519     return getNumDocuments(Const.SOLR_CORE_ATH_TM_SEGMENTS);
520   }
521 
522   public static long getNumDocuments(String coreName) {
523     return getNumDocuments(coreName, "*:*");
524   }
525 
526   public static long getNumDocuments(String coreName, String query) {
527     try {
528       QueryResponse response = AthIndex.getMany(coreName, query, null,
529           QueryResponse.class);
530 
531       SolrDocumentList docList = response.getResults();
532       return docList.getNumFound();
533 
534     } catch (Exception e) {
535       // TODO Log error
536       return -1;
537     }
538   }
539 
540   public static void safeAddField(ITextUnit tu, SolrInputDocument doc, String name, String value)
541       throws AthException {
542 
543     safeAddField(tu, doc, name, value, true);
544   }
545 
546   public static void safeAddField(SolrInputDocument doc, String name, String value) {
547     if (!Util.isEmpty(value)) {
548       doc.addField(name, value);
549     }
550   }
551 
552   public static void safeAddField(SolrInputDocument doc, String name, UUID value) {
553     if (value != null) {
554       doc.addField(name, value.toString());
555     }
556   }
557 
558   public static void safeAddField(ITextUnit tu, SolrInputDocument doc, String name, String value,
559       boolean strictValueCheck) throws AthException {
560 
561     if (doc == null) {
562       AthException.logAndThrow(Solr.class, "Doc is null for TU:\n{}", tu.getId());
563     }
564 
565     if (Util.isEmpty(name)) {
566       AthException.logAndThrow(Solr.class, "Null or empty name for TU:\n{}", tu.getId());
567     }
568 
569     if (Util.isEmpty(value) || Util.isEmpty(value.trim())) {
570       if (strictValueCheck) {
571         AthException.logAndThrow(Solr.class, "Null or empty value of the '{}' field for TU:\n{}",
572             name, tu.getId());
573 
574       } else {
575         // Silently quit w/o adding the field
576         return;
577       }
578     }
579 
580     doc.addField(name, value);
581   }
582 
583   public static void safeSetField(ITextUnit tu, SolrInputDocument doc, String name, String value)
584       throws AthException {
585 
586     safeSetField(tu, doc, name, value, true);
587   }
588 
589   public static void safeSetField(ITextUnit tu, SolrInputDocument doc, String name, String value,
590       boolean strictValueCheck) throws AthException {
591 
592     if (doc == null) {
593       AthException.logAndThrow(Solr.class, "Doc is null for TU:\n{}", tu.getId());
594     }
595 
596     if (Util.isEmpty(name)) {
597       AthException.logAndThrow(Solr.class, "Null or empty name for TU:\n{}", tu.getId());
598     }
599 
600     if (Util.isEmpty(value) || Util.isEmpty(value.trim())) {
601       if (strictValueCheck) {
602         AthException.logAndThrow(Solr.class, "Null or empty value of the '{}' field for TU:\n{}",
603             name, tu.getId());
604 
605       } else {
606         // Silently quit w/o adding the field
607         return;
608       }
609     }
610 
611     doc.setField(name, value);
612   }
613 
614   public static void safeSetField(SolrInputDocument doc, String name, String value) {
615     if (!Util.isEmpty(value)) {
616       doc.setField(name, value);
617     }
618   }
619   
620   public static void safeSetField(SolrInputDocument doc, String name, UUID value) {
621     if (value != null) {
622       doc.setField(name, value);
623     }
624   }
625 
626   /**
627    * Safely gets a field value from a SolrDocument and converts to String.
628    * Returns defVal if field is null or empty.
629    */
630   public static String safeGetField(SolrDocument doc, String fieldName, String defVal) {
631     Object val = doc == null ? null : doc.getFieldValue(fieldName);
632     return val != null ? val.toString() : defVal;
633   }
634 
635   public static Long safeGetLongField(SolrDocument doc, String fieldName, Long defVal) {
636     Object val = doc == null ? null : doc.getFieldValue(fieldName);
637 
638     if (val == null) {
639       return defVal;
640     }
641 
642     try {
643       if (val instanceof Long) {
644         return (Long) val;
645 
646       } else if (val instanceof Number) {
647         return ((Number) val).longValue();
648 
649       } else {
650         // Assume string representation of a number (e.g., "1", "10")
651         return Long.parseLong(val.toString());
652       }
653 
654     } catch (NumberFormatException e) {
655       return defVal;
656     }
657   }
658   
659   public static Integer safeGetIntField(SolrDocument doc, String fieldName, Integer defVal) {
660     Object val = doc == null ? null : doc.getFieldValue(fieldName);
661 
662     if (val == null) {
663       return defVal;
664     }
665 
666     try {
667       if (val instanceof Integer) {
668         return (Integer) val;
669 
670       } else if (val instanceof Number) {
671         return ((Number) val).intValue();
672 
673       } else {
674         // Assume string representation of a number (e.g., "1", "10")
675         return Integer.parseInt(val.toString());
676       }
677 
678     } catch (NumberFormatException e) {
679       return defVal;
680     }
681   }
682 
683   public static String buildDocSegSolrId(UUID docId, Long position) {
684     return Base64Util.encodeString(Log.format("{}-{}", docId.toString(), position));
685   }
686 
687   public static String buildTmSegSolrId(UUID tmId, String sourceWithCodes) {
688     return Base64Util.encodeString(Log.format("{}-{}", tmId.toString(), sourceWithCodes));
689   }
690 }