View Javadoc
1   package com.acumenvelocity.ath.filters.pdf;
2   
3   import com.acumenvelocity.ath.model.OcrMode;
4   
5   import net.sf.okapi.common.StringParameters;
6   
7   /**
8    * Filter parameters for the PDF filter (with OCR and DOCX-related extraction options).
9    *
10   * <p>
11   * Extends Okapi-style parameter management. Includes options similar to
12   * {@code ConditionalParameters} from the OpenXML filter that are relevant to
13   * DOCX or general Office file content.
14   * </p>
15   */
16  public class Parameters extends StringParameters {
17  
18    // === OCR handling ===
19    public static final String PROP_OCR_MODE = "ocrMode";
20  
21    // === DOCX / General Office parameters (preserving Okapi constants) ===
22    private static final String MAX_ATTRIBUTE_SIZE = "maxAttributeSize";
23    private static final String TRANSLATEDOCPROPERTIES = "bPreferenceTranslateDocProperties";
24    private static final String TRANSLATECOMMENTS = "bPreferenceTranslateComments";
25    private static final String AGGRESSIVECLEANUP = "bPreferenceAggressiveCleanup";
26    private static final String AUTOMATICALLY_ACCEPT_REVISIONS = "bPreferenceAutomaticallyAcceptRevisions";
27    private static final String TRANSLATEWORDHEADERSFOOTERS = "bPreferenceTranslateWordHeadersFooters";
28    private static final String TRANSLATE_WORD_NUMBERING_LEVEL_TEXT = "translateWordNumberingLevelText";
29    private static final String TRANSLATE_WORD_HIDDEN = "bPreferenceTranslateWordHidden";
30    private static final String TRANSLATEWORDEXCLUDEGRAPHICMETADATA = "bPreferenceTranslateWordExcludeGraphicMetaData";
31    private static final String ADDTABASCHARACTER = "bPreferenceAddTabAsCharacter";
32    private static final String ADDLINESEPARATORASCHARACTER = "bPreferenceAddLineSeparatorAsCharacter";
33    private static final String LINESEPARATORREPLACEMENT = "sPreferenceLineSeparatorReplacement";
34    private static final String REPLACE_NO_BREAK_HYPHEN_TAG = "bPreferenceReplaceNoBreakHyphenTag";
35    private static final String IGNORE_SOFT_HYPHEN_TAG = "bPreferenceIgnoreSoftHyphenTag";
36    private static final String TRANSLATE_WORD_EXCLUDE_STYLE_MODE = "bInExcludeMode";
37    private static final String TRANSLATE_WORD_EXCLUDE_HIGHLIGHT_MODE = "bInExcludeHighlightMode";
38    private static final String TRANSLATE_WORD_EXCLUDE_COLORS = "bPreferenceTranslateWordExcludeColors";
39    private static final String IGNORE_WORD_FONT_COLORS = "ignoreWordFontColors";
40    private static final String WORD_FONT_COLORS_MIN_IGNORANCE_THRESHOLD = "wordFontColorsMinIgnoranceThreshold";
41    private static final String WORD_FONT_COLORS_MAX_IGNORANCE_THRESHOLD = "wordFontColorsMaxIgnoranceThreshold";
42    private static final String ALLOW_WORD_STYLE_OPTIMISATION = "allowWordStyleOptimisation";
43    private static final String ALLOW_EMPTY_TARGETS = "bPreferenceAllowEmptyTargets";
44  
45    public Parameters() {
46      super();
47      reset();
48    }
49  
50    @Override
51    public void reset() {
52      super.reset();
53      setOcrMode(OcrMode.AUTO);
54  
55      // Defaults following Okapi OpenXML behavior
56      setMaxAttributeSize(4 * 1024 * 1024);
57      setTranslateDocProperties(false);
58      setTranslateComments(true);
59      setCleanupAggressively(false);
60      setAutomaticallyAcceptRevisions(true);
61      setTranslateWordHeadersFooters(true);
62      setTranslateWordNumberingLevelText(false);
63      setTranslateWordHidden(false);
64      setTranslateWordExcludeGraphicMetaData(true);
65      setAddTabAsCharacter(false);
66      setAddLineSeparatorCharacter(false);
67      setLineSeparatorReplacement('\n');
68      setReplaceNoBreakHyphenTag(false);
69      setIgnoreSoftHyphenTag(false);
70      setTranslateWordInExcludeStyleMode(true);
71      setTranslateWordInExcludeHighlightMode(true);
72      setTranslateWordExcludeColors(false);
73      setIgnoreWordFontColors(false);
74      setAllowWordStyleOptimisation(true);
75      setAllowEmptyTargets(false);
76    }
77  
78    // === OCR mode ===
79    public OcrMode getOcrMode() {
80      return OcrMode.fromValue(getString(PROP_OCR_MODE));
81    }
82  
83    public void setOcrMode(OcrMode mode) {
84      setString(PROP_OCR_MODE, mode != null ? mode.name() : OcrMode.AUTO.name());
85    }
86  
87    // === DOCX/general getters/setters ===
88    public int getMaxAttributeSize() {
89      return getInteger(MAX_ATTRIBUTE_SIZE);
90    }
91  
92    public void setMaxAttributeSize(int value) {
93      setInteger(MAX_ATTRIBUTE_SIZE, value);
94    }
95  
96    public boolean getTranslateDocProperties() {
97      return getBoolean(TRANSLATEDOCPROPERTIES);
98    }
99  
100   public void setTranslateDocProperties(boolean value) {
101     setBoolean(TRANSLATEDOCPROPERTIES, value);
102   }
103 
104   public boolean getTranslateComments() {
105     return getBoolean(TRANSLATECOMMENTS);
106   }
107 
108   public void setTranslateComments(boolean value) {
109     setBoolean(TRANSLATECOMMENTS, value);
110   }
111 
112   public boolean getCleanupAggressively() {
113     return getBoolean(AGGRESSIVECLEANUP);
114   }
115 
116   public void setCleanupAggressively(boolean value) {
117     setBoolean(AGGRESSIVECLEANUP, value);
118   }
119 
120   public boolean getAutomaticallyAcceptRevisions() {
121     return getBoolean(AUTOMATICALLY_ACCEPT_REVISIONS);
122   }
123 
124   public void setAutomaticallyAcceptRevisions(boolean value) {
125     setBoolean(AUTOMATICALLY_ACCEPT_REVISIONS, value);
126   }
127 
128   public boolean getTranslateWordHeadersFooters() {
129     return getBoolean(TRANSLATEWORDHEADERSFOOTERS);
130   }
131 
132   public void setTranslateWordHeadersFooters(boolean value) {
133     setBoolean(TRANSLATEWORDHEADERSFOOTERS, value);
134   }
135 
136   public boolean getTranslateWordNumberingLevelText() {
137     return getBoolean(TRANSLATE_WORD_NUMBERING_LEVEL_TEXT);
138   }
139 
140   public void setTranslateWordNumberingLevelText(boolean value) {
141     setBoolean(TRANSLATE_WORD_NUMBERING_LEVEL_TEXT, value);
142   }
143 
144   public boolean getTranslateWordHidden() {
145     return getBoolean(TRANSLATE_WORD_HIDDEN);
146   }
147 
148   public void setTranslateWordHidden(boolean value) {
149     setBoolean(TRANSLATE_WORD_HIDDEN, value);
150   }
151 
152   public boolean getTranslateWordExcludeGraphicMetaData() {
153     return getBoolean(TRANSLATEWORDEXCLUDEGRAPHICMETADATA);
154   }
155 
156   public void setTranslateWordExcludeGraphicMetaData(boolean value) {
157     setBoolean(TRANSLATEWORDEXCLUDEGRAPHICMETADATA, value);
158   }
159 
160   public boolean getAddTabAsCharacter() {
161     return getBoolean(ADDTABASCHARACTER);
162   }
163 
164   public void setAddTabAsCharacter(boolean value) {
165     setBoolean(ADDTABASCHARACTER, value);
166   }
167 
168   public boolean getAddLineSeparatorCharacter() {
169     return getBoolean(ADDLINESEPARATORASCHARACTER);
170   }
171 
172   public void setAddLineSeparatorCharacter(boolean value) {
173     setBoolean(ADDLINESEPARATORASCHARACTER, value);
174   }
175 
176   public char getLineSeparatorReplacement() {
177     String s = getString(LINESEPARATORREPLACEMENT);
178     return (s == null || s.isEmpty()) ? '\n' : s.charAt(0);
179   }
180 
181   public void setLineSeparatorReplacement(char value) {
182     setString(LINESEPARATORREPLACEMENT, String.valueOf(value));
183   }
184 
185   public boolean getReplaceNoBreakHyphenTag() {
186     return getBoolean(REPLACE_NO_BREAK_HYPHEN_TAG);
187   }
188 
189   public void setReplaceNoBreakHyphenTag(boolean value) {
190     setBoolean(REPLACE_NO_BREAK_HYPHEN_TAG, value);
191   }
192 
193   public boolean getIgnoreSoftHyphenTag() {
194     return getBoolean(IGNORE_SOFT_HYPHEN_TAG);
195   }
196 
197   public void setIgnoreSoftHyphenTag(boolean value) {
198     setBoolean(IGNORE_SOFT_HYPHEN_TAG, value);
199   }
200 
201   public boolean getTranslateWordInExcludeStyleMode() {
202     return getBoolean(TRANSLATE_WORD_EXCLUDE_STYLE_MODE);
203   }
204 
205   public void setTranslateWordInExcludeStyleMode(boolean value) {
206     setBoolean(TRANSLATE_WORD_EXCLUDE_STYLE_MODE, value);
207   }
208 
209   public boolean getTranslateWordInExcludeHighlightMode() {
210     return getBoolean(TRANSLATE_WORD_EXCLUDE_HIGHLIGHT_MODE);
211   }
212 
213   public void setTranslateWordInExcludeHighlightMode(boolean value) {
214     setBoolean(TRANSLATE_WORD_EXCLUDE_HIGHLIGHT_MODE, value);
215   }
216 
217   public boolean getTranslateWordExcludeColors() {
218     return getBoolean(TRANSLATE_WORD_EXCLUDE_COLORS);
219   }
220 
221   public void setTranslateWordExcludeColors(boolean value) {
222     setBoolean(TRANSLATE_WORD_EXCLUDE_COLORS, value);
223   }
224 
225   public boolean getIgnoreWordFontColors() {
226     return getBoolean(IGNORE_WORD_FONT_COLORS);
227   }
228 
229   public void setIgnoreWordFontColors(boolean value) {
230     setBoolean(IGNORE_WORD_FONT_COLORS, value);
231   }
232 
233   public String getWordFontColorsMinIgnoranceThreshold() {
234     return getString(WORD_FONT_COLORS_MIN_IGNORANCE_THRESHOLD);
235   }
236 
237   public void setWordFontColorsMinIgnoranceThreshold(String value) {
238     setString(WORD_FONT_COLORS_MIN_IGNORANCE_THRESHOLD, value);
239   }
240 
241   public String getWordFontColorsMaxIgnoranceThreshold() {
242     return getString(WORD_FONT_COLORS_MAX_IGNORANCE_THRESHOLD);
243   }
244 
245   public void setWordFontColorsMaxIgnoranceThreshold(String value) {
246     setString(WORD_FONT_COLORS_MAX_IGNORANCE_THRESHOLD, value);
247   }
248 
249   public boolean getAllowWordStyleOptimisation() {
250     return getBoolean(ALLOW_WORD_STYLE_OPTIMISATION);
251   }
252 
253   public void setAllowWordStyleOptimisation(boolean value) {
254     setBoolean(ALLOW_WORD_STYLE_OPTIMISATION, value);
255   }
256 
257   public boolean getAllowEmptyTargets() {
258     return getBoolean(ALLOW_EMPTY_TARGETS);
259   }
260 
261   public void setAllowEmptyTargets(boolean value) {
262     setBoolean(ALLOW_EMPTY_TARGETS, value);
263   }
264 }