lfoppiano committed on
Commit
3e5af34
·
verified ·
1 Parent(s): 90538e1

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +17 -144
Dockerfile CHANGED
@@ -1,151 +1,24 @@
1
- import http.server
2
- import socketserver
3
- import subprocess
4
- import threading
5
- import queue
6
- import io
7
- import select
8
- import re
9
- import os
10
- import urllib.request
11
- import urllib.parse
12
- import urllib.error
13
 
14
- PORT = 8080
15
- command_executed = False
16
- command_output = queue.Queue()
17
- command_queue = queue.Queue()
18
 
 
 
 
19
 
20
- def sanitize_output(output):
21
- # Remove any non-printable characters except for carriage return
22
- return re.sub(r'[^\x20-\x7E\r]+', '', output)
23
 
 
24
 
25
- def run_command(command):
26
- print("Running command ", command)
27
- process = subprocess.Popen(
28
- command,
29
- stdout=subprocess.PIPE,
30
- stderr=subprocess.PIPE,
31
- text=True,
32
- bufsize=1 # Line-buffered
33
- )
34
- while True:
35
- reads = [process.stdout.fileno(), process.stderr.fileno()]
36
- ret = select.select(reads, [], [], 0.1) # Timeout to periodically flush
37
- for fd in ret[0]:
38
- if fd == process.stdout.fileno():
39
- output = process.stdout.readline()
40
- if output:
41
- command_output.put(sanitize_output(output) + "\n")
42
- if fd == process.stderr.fileno():
43
- error = process.stderr.readline()
44
- if error:
45
- command_output.put(sanitize_output(error) + "\n")
46
- if process.poll() is not None:
47
- break
48
- process.stdout.flush()
49
- process.stderr.flush()
50
- process.stdout.close()
51
- process.stderr.close()
52
- process.wait()
53
- command_output.put("Leaving")
54
 
55
 
56
- def command_worker():
57
- print("Starting worker command")
58
- while True:
59
- command = command_queue.get()
60
- if command is None:
61
- break
62
- run_command(command)
63
- command_queue.task_done()
64
-
65
-
66
- def start_commands():
67
- commands = [
68
- ["echo", "## standard process"],
69
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/PMC_sample_1943', '-Prun=1', '-PfileRatio=1'],
70
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-pmc_sample_1943.md"],
71
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/PLOS_1000', '-Prun=1', '-PfileRatio=1'],
72
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-plos_1000.md"],
73
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/biorxiv-10k-test-2000', '-Prun=1', '-PfileRatio=1'],
74
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-biorxiv-10k-test-2000.md"],
75
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/eLife_984', '-Prun=1', '-PfileRatio=1'],
76
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-elife_984.md"],
77
- ["echo", "## article/light"],
78
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/PMC_sample_1943', '-Prun=1', '-PfileRatio=1', '-Pflavor=article/light'],
79
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-pmc_sample_1943-article_light.md"],
80
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/PLOS_1000', '-Prun=1', '-PfileRatio=1', '-Pflavor=article/light'],
81
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-plos_1000-article_light.md"],
82
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/biorxiv-10k-test-2000', '-Prun=1', '-PfileRatio=1', '-Pflavor=article/light'],
83
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-biorxiv-10k-test-2000-article_light.md"],
84
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/eLife_984', '-Prun=1', '-PfileRatio=1', '-Pflavor=article/light'],
85
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-eLife_984-article_light.md"],
86
- ["echo", "## article/light-ref"],
87
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/PMC_sample_1943', '-Prun=1', '-PfileRatio=1', '-Pflavor=article/light-ref'],
88
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-pmc_sample_1943-article_light_ref.md"],
89
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/PLOS_1000', '-Prun=1', '-PfileRatio=1', '-Pflavor=article/light-ref'],
90
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-plos_1000-article_light_ref.md"],
91
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/biorxiv-10k-test-2000', '-Prun=1', '-PfileRatio=1', '-Pflavor=article/light-ref'],
92
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-biorxiv-10k-test-2000-article_light_ref.md"],
93
- ['./gradlew', 'jatsEval', '-Pp2t=/opt/grobid/evaluation/eLife_984', '-Prun=1', '-PfileRatio=1', '-Pflavor=article/light-ref'],
94
- ["cp", "/opt/grobid/grobid-home/tmp/report.md", "/opt/grobid/grobid-home/tmp/report-eLife_984-article_light_ref.md"],
95
- ]
96
- for command in commands:
97
- command_queue.put(command)
98
- threading.Thread(target=command_worker).start()
99
-
100
-
101
- class Handler(http.server.SimpleHTTPRequestHandler):
102
- def do_GET(self):
103
- global command_executed
104
-
105
- # Check if the request is for downloading a file from URL
106
- if self.path.startswith('/fetch?filename='):
107
- try:
108
- # Parse the URL from the query string
109
- filename = urllib.parse.unquote(self.path.split('=', 1)[1])
110
-
111
- # Construct the full file path
112
- file_path = os.path.join('/opt/grobid/grobid-home/tmp/', filename)
113
-
114
- # Check if the file exists
115
- if not os.path.isfile(file_path):
116
- self.send_error(404, f"File '{filename}' not found.")
117
- return
118
-
119
- # Read the file content
120
- with open(file_path, 'r') as file:
121
- content = file.read()
122
-
123
- response = content
124
-
125
- except urllib.error.URLError as e:
126
- self.send_error(500, f"Failed to download file: {str(e)}")
127
- except Exception as e:
128
- self.send_error(500, f"An error occurred: {str(e)}")
129
-
130
- elif not command_executed:
131
- command_executed = True
132
- threading.Thread(target=start_commands).start()
133
- response = "Starting evaluation."
134
-
135
- else:
136
- response = io.StringIO()
137
- response.write("Command output:\n")
138
- for item in list(command_output.queue):
139
- response.write(item)
140
-
141
- response = response.getvalue()
142
-
143
- self.send_response(200)
144
- self.send_header("Content-type", "text/plain")
145
- self.end_headers()
146
- self.wfile.write(response.encode())
147
-
148
-
149
- with socketserver.TCPServer(("", PORT), Handler) as httpd:
150
- print(f"Serving on port {PORT}")
151
- httpd.serve_forever()
 
1
+ FROM lfoppiano/grobid-evaluation:latest
2
+ USER root
3
+ RUN mkdir -m 777 -p /opt/grobid/grobid-home/tmp
4
+ RUN mkdir -m 777 -p /opt/grobid/logs
5
+ RUN chmod -R 777 /opt/grobid
6
+ RUN chmod -R uog+rw /data/db
 
 
 
 
 
 
7
 
8
+ # CRF
9
+ COPY --chown=lfoppiano grobid.yaml /opt/grobid/grobid-home/config/grobid.yaml
 
 
10
 
11
+ # official evaluation configuration
12
+ COPY --chown=lfoppiano grobid-full.yml /opt/grobid/grobid-home/config/grobid.yaml
13
+ RUN ls -R /opt/grobid/evaluation/
14
 
15
+ WORKDIR /opt/grobid
16
+ RUN git lfs install && git clone --depth 1 https://huggingface.co/datasets/sciencialab/grobid-evaluation evaluation
17
+ RUN chmod -R uog+rw /opt/grobid/evaluation
18
 
19
+ COPY --chown=lfoppiano service.py /opt/grobid/service.py
20
 
21
+ WORKDIR /opt/grobid
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
+ CMD ["/bin/bash", "-c", "python service.py"]