lfoppiano committed
Commit 0a1e029 · verified · 1 Parent(s): cc380b1

Create grobid-full.yml

Files changed (1):
  grobid-full.yml +370 -0
grobid-full.yml ADDED
@@ -0,0 +1,370 @@
+ # This is the configuration file for the GROBID instance that uses the Deep Learning models.
+
+ grobid:
+   # where all the GROBID resources are stored (models, lexicon, native libraries, etc.), normally no need to change this
+   grobidHome: "/opt/grobid/grobid-home"
+
+   # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or an absolute path (/tmp)
+   temp: "tmp"
+
+   # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
+   nativelibrary: "lib"
+
+   pdf:
+     pdfalto:
+       # path relative to the grobid-home path (e.g. grobid-home/pdfalto), normally you don't want to change this
+       path: "pdfalto"
+       # security limits for PDF parsing
+       memoryLimitMb: 6096
+       timeoutSec: 120
+
+     # security limits relative to the PDF parsing result
+     blocksMax: 200000
+     tokensMax: 1000000
+
+   consolidation:
+     # define the bibliographical data consolidation service to be used, either "crossref" for the CrossRef REST API or
+     # "glutton" for https://github.com/kermitt2/biblio-glutton
+     # service: "crossref"
+     service: "glutton"
+     glutton:
+       url: "http://sciencialab.ddns.net:8080"
+       # url: "http://localhost:8080"
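+       # sanity check (a sketch, not part of the original file; the query values are placeholders):
+       # biblio-glutton exposes a REST lookup endpoint that can be tested directly, e.g.
+       # curl "http://sciencialab.ddns.net:8080/service/lookup?atitle=GROBID&firstAuthor=Lopez"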
+     crossref:
+       mailto:
+       # to use the CrossRef web API politely, you should normally indicate an email address here, e.g.
+       # mailto: "toto@titi.tutu"
+       token:
+       # to use the Crossref Metadata Plus service (available by subscription)
+       # token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
+
+   proxy:
+     # proxy to be used for external calls to the consolidation service
+     host:
+     port:
+
+   # CORS configuration for the GROBID web API service
+   corsAllowedOrigins: "*"
+   corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
+   corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
+
+   # the actual implementation to be used for language recognition
+   languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
+
+   # the actual implementation to be used for optional sentence segmentation (PragmaticSegmenter or OpenNLP)
+   # sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
+   sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
+
+   # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
+   # for a production server running only GROBID, set the value slightly above the available number of threads of the server
+   # to get the best performance and security
+   concurrency: 10
+   # when the pool is full, this is the maximum time (in seconds) queries will wait for a Grobid engine
+   # to become available - normally never change it
+   poolMaxWait: 1
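+   # sizing sketch (hypothetical numbers, not from the original file): a dedicated server exposing
+   # 8 hardware threads would follow the advice above with a concurrency of about 9-10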
+
+   delft:
+     # DeLFT global parameters
+     # DeLFT installation path, needed if Deep Learning architectures are used to implement one of the sequence labeling models;
+     # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF models are used)
+     install: "/opt/delft"
+     pythonVirtualEnv:
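+     # if DeLFT runs inside a Python virtual environment, its path can be set above;
+     # a hypothetical example (not part of this deployment):
+     # pythonVirtualEnv: "/opt/delft/venv"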
+
+   wapiti:
+     # Wapiti global parameters
+     # number of threads for training the Wapiti models (0 to use all available processors)
+     nbThreads: 0
+
+   models:
+     # we configure here how each sequence labeling model should be implemented
+     # for feature-engineered CRF, use "wapiti"; possible training parameters are window, epsilon and nbMaxIterations
+     # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
+     # parameters then depend on the selected DL architecture
+
+     - name: "segmentation"
+       # at this time, this must always be Wapiti CRF: the input sequence size is too large for a Deep Learning implementation
+       engine: "wapiti"
+       # engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.0000001
+         window: 50
+         nbMaxIterations: 2000
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF_FEATURES"
+         useELMo: false
+         runtime:
+           # parameters used at runtime/prediction
+           max_sequence_length: 3000
+           batch_size: 1
+         training:
+           # parameters used for training
+           max_sequence_length: 3000
+           batch_size: 10
+
+     - name: "segmentation-article-light"
+       engine: "wapiti"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.0000001
+         window: 50
+         nbMaxIterations: 2000
+
+     - name: "segmentation-article-light-ref"
+       engine: "wapiti"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.0000001
+         window: 50
+         nbMaxIterations: 2000
+
+     - name: "fulltext"
+       # at this time, this must always be Wapiti CRF: the input sequence size is too large for a Deep Learning implementation
+       engine: "wapiti"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.0001
+         window: 20
+         nbMaxIterations: 1500
+
+     - name: "header"
+       # engine: "wapiti"
+       engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.000001
+         window: 30
+         nbMaxIterations: 1500
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_ChainCRF_FEATURES"
+         # transformer: "allenai/scibert_scivocab_cased"
+         useELMo: false
+         runtime:
+           # parameters used at runtime/prediction
+           # max_sequence_length: 510
+           max_sequence_length: 3000
+           batch_size: 1
+         training:
+           # parameters used for training
+           # max_sequence_length: 510
+           # batch_size: 6
+           max_sequence_length: 3000
+           batch_size: 9
+
+     - name: "header-article-light"
+       # engine: "wapiti"
+       engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.000001
+         window: 30
+         nbMaxIterations: 1500
+       delft:
+         architecture: "BidLSTM_ChainCRF_FEATURES"
+         useELMo: false
+
+     - name: "header-article-light-ref"
+       # engine: "wapiti"
+       engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.000001
+         window: 30
+         nbMaxIterations: 1500
+       delft:
+         architecture: "BidLSTM_ChainCRF_FEATURES"
+         useELMo: false
+
+     - name: "reference-segmenter"
+       # engine: "wapiti"
+       engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.00001
+         window: 20
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_ChainCRF_FEATURES"
+         useELMo: false
+         runtime:
+           # parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
+           max_sequence_length: 3000
+           batch_size: 2
+         training:
+           # parameters used for training
+           max_sequence_length: 3000
+           batch_size: 10
+
+     - name: "name-header"
+       engine: "wapiti"
+       # engine: "delft"
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF_FEATURES"
+
+     - name: "name-citation"
+       engine: "wapiti"
+       # engine: "delft"
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF_FEATURES"
+
+     - name: "date"
+       engine: "wapiti"
+       # engine: "delft"
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF_FEATURES"
+
+     - name: "figure"
+       engine: "wapiti"
+       # engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.00001
+         window: 20
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF"
+
+     - name: "table"
+       engine: "wapiti"
+       # engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.00001
+         window: 20
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF"
+
+     - name: "affiliation-address"
+       # engine: "wapiti"
+       engine: "delft"
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF_FEATURES"
+
+     - name: "citation"
+       # engine: "wapiti"
+       engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.00001
+         window: 50
+         nbMaxIterations: 3000
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF_FEATURES"
+         # architecture: "BERT_CRF"
+         # transformer: "michiyasunaga/LinkBERT-base"
+         useELMo: false
+         runtime:
+           # parameters used at runtime/prediction
+           max_sequence_length: 500
+           batch_size: 30
+         training:
+           # parameters used for training
+           max_sequence_length: 500
+           batch_size: 50
+
+     - name: "patent-citation"
+       engine: "wapiti"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.0001
+         window: 20
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF_FEATURES"
+         # architecture: "BERT_CRF"
+         runtime:
+           # parameters used at runtime/prediction
+           max_sequence_length: 800
+           batch_size: 20
+         training:
+           # parameters used for training
+           max_sequence_length: 1000
+           batch_size: 40
+
+     - name: "funding-acknowledgement"
+       # engine: "wapiti"
+       engine: "delft"
+       wapiti:
+         # Wapiti training parameters, used at training time only
+         epsilon: 0.00001
+         window: 50
+         nbMaxIterations: 2000
+       delft:
+         # deep learning parameters
+         architecture: "BidLSTM_CRF_FEATURES"
+         # architecture: "BERT_CRF"
+         # transformer: "michiyasunaga/LinkBERT-base"
+         useELMo: false
+         runtime:
+           # parameters used at runtime/prediction
+           max_sequence_length: 800
+           batch_size: 20
+         training:
+           # parameters used for training
+           max_sequence_length: 500
+           batch_size: 40
+
+     - name: "copyright"
+       # at this time, only a DeLFT implementation is available;
+       # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
+       # engine: "delft"
+       engine: "wapiti"
+       delft:
+         # deep learning parameters
+         architecture: "gru"
+         # architecture: "bert"
+         # transformer: "allenai/scibert_scivocab_cased"
+
+     - name: "license"
+       # at this time, to be active this model must use DeLFT, no other implementation is available;
+       # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
+       # engine: "delft"
+       engine: "wapiti"
+       delft:
+         # deep learning parameters
+         architecture: "gru"
+         # architecture: "bert"
+         # transformer: "allenai/scibert_scivocab_cased"
+
+   # for **service only**: how to load the models,
+   # false -> models are loaded when needed, which avoids keeping unused models in memory (only in the case of CRF)
+   # but significantly slows down the first call to the service
+   # true -> all the models are loaded into memory at server startup (default); this slows down the server start
+   # and unused models will take some more memory (only in the case of CRF), but the server is immediately warm and ready
+   modelPreload: true
+
+ server:
+   type: custom
+   applicationConnectors:
+     - type: http
+       port: 8070
+   adminConnectors:
+     - type: http
+       port: 8071
+   registerDefaultExceptionMappers: false
+   # change the following to have all HTTP requests logged
+   requestLog:
+     appenders: []
+
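+ # once the server is up, it can be checked against the connectors configured above, assuming a
+ # local deployment (the PDF filename is a placeholder):
+ #   curl http://localhost:8070/api/isalive
+ #   curl -X POST -F input=@article.pdf http://localhost:8070/api/processFulltextDocument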
+ # these logging settings apply to the GROBID service usage mode
+ logging:
+   level: INFO
+   loggers:
+     org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
+     org.glassfish.jersey.internal: "OFF"
+     com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
+   appenders:
+     - type: console
+       threshold: WARN
+       timeZone: UTC
+       # uncomment to have the logs in JSON format
+       # layout:
+       #   type: json