lfoppiano committed on
Commit
90538e1
·
verified ·
1 Parent(s): 8c48418

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +145 -17
Dockerfile CHANGED
@@ -1,23 +1,151 @@
1
- FROM lfoppiano/grobid-evaluation:latest
2
- USER root
3
- RUN mkdir -m 777 -p /opt/grobid/grobid-home/tmp
4
- RUN mkdir -m 777 -p /opt/grobid/logs
5
- RUN chmod -R 777 /opt/grobid
6
- RUN chmod -R uog+rw /data/db
 
 
 
 
 
 
7
 
8
- # CRF
9
- COPY --chown=lfoppiano grobid.yaml /opt/grobid/grobid-home/config/grobid.yaml
 
 
10
 
11
- # official evaluation configuration
12
- COPY --chown=lfoppiano grobid-full.yml /opt/grobid/grobid-home/config/grobid.yaml
13
- RUN ls -R /opt/grobid/evaluation/
14
 
15
- WORKDIR /opt/grobid
16
- RUN git lfs install && git clone --depth 1 https://huggingface.co/datasets/sciencialab/grobid-evaluation evaluation
17
- RUN chmod -R uog+rw /opt/grobid/evaluation
18
 
19
- COPY --chown=lfoppiano service.py /opt/grobid/service.py
20
 
21
- WORKDIR /opt/grobid
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- CMD ["/bin/bash", "-c", "python service.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.server
2
+ import socketserver
3
+ import subprocess
4
+ import threading
5
+ import queue
6
+ import io
7
+ import select
8
+ import re
9
+ import os
10
+ import urllib.request
11
+ import urllib.parse
12
+ import urllib.error
13
 
14
# TCP port the HTTP endpoint listens on.
PORT = 8080

# Set to True by the first non-fetch GET request so the evaluation commands
# are only launched once.
command_executed = False

# Sanitized output chunks produced by the commands, in arrival order.
command_output = queue.Queue()

# Commands (argument lists) waiting to be executed by the worker thread.
command_queue = queue.Queue()
18
 
 
 
 
19
 
20
# ANSI/VT100 escape sequences: either a two-character escape (ESC + one byte
# in @-_ ) or a CSI sequence (ESC [ parameters intermediates final-byte).
_ANSI_ESCAPE_RE = re.compile(r'\x1b(?:\[[0-?]*[ -/]*[@-~]|[@-Z\\-_])')

# Anything that is neither printable ASCII nor a carriage return.
_NON_PRINTABLE_RE = re.compile(r'[^\x20-\x7E\r]+')


def sanitize_output(output):
    """Return *output* reduced to printable ASCII plus carriage returns.

    Args:
        output: Text captured from a command.

    Returns:
        The text with ANSI escape sequences removed as a whole, followed by
        the removal of every remaining character outside ``\\x20``-``\\x7E``
        other than ``\\r``. Line feeds and tabs are removed as well.
    """
    # Strip complete escape sequences first: deleting only the ESC byte would
    # leave residue such as "[32m" in the sanitized text.
    without_escapes = _ANSI_ESCAPE_RE.sub('', output)
    return _NON_PRINTABLE_RE.sub('', without_escapes)
23
 
 
24
 
25
def run_command(command):
    """Run *command* to completion and stream its output into ``command_output``.

    Args:
        command: Argument list handed to ``subprocess.Popen``.

    Each line the child writes to stdout or stderr is passed through
    ``sanitize_output`` and queued with a trailing newline. When the child
    has exited, or could not be started at all, a ``"Leaving"`` marker line
    is queued.
    """
    print("Running command ", command)
    try:
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            # Both streams share one pipe so a single blocking reader sees
            # every line. Polling the raw descriptors with select() while
            # reading through the buffered text wrappers could miss lines
            # that were already sitting in the Python-side buffer.
            stderr=subprocess.STDOUT,
            text=True,
            # Undecodable bytes must not abort the run; the replacement
            # character is removed by sanitize_output() anyway.
            errors="replace",
            bufsize=1,  # Line-buffered
        )
    except OSError as exc:
        # Report a command that cannot be launched instead of raising into
        # the calling thread.
        command_output.put(
            sanitize_output(f"Failed to start {command}: {exc}") + "\n"
        )
        command_output.put("Leaving\n")
        return

    # Leaving the ``with`` block closes the pipe and waits for the child.
    with process:
        # Iterating until EOF guarantees that output written right before the
        # child exits is still collected.
        for line in process.stdout:
            command_output.put(sanitize_output(line) + "\n")

    # Newline-terminated so the marker does not fuse with the first line of
    # the next command when the queued chunks are concatenated.
    command_output.put("Leaving\n")
54
 
55
+
56
def command_worker():
    """Execute queued commands one after another until a ``None`` sentinel arrives.

    Every item taken from ``command_queue`` is acknowledged with
    ``task_done()``, including the sentinel and commands whose execution
    raised, so the queue's unfinished-task count stays accurate.
    """
    print("Starting worker command")
    while True:
        command = command_queue.get()
        try:
            if command is None:
                break
            run_command(command)
        except Exception as exc:
            # This loop is the top of the worker thread: report the failure
            # and keep going so the remaining commands are still executed.
            print("Command failed ", command, exc)
        finally:
            command_queue.task_done()
64
+
65
+
66
def start_commands():
    """Queue the whole evaluation plan, then start the worker thread.

    The plan has three sections (no flavor, ``article/light`` and
    ``article/light-ref``). Each section queues an ``echo`` heading and then,
    per dataset, a ``./gradlew jatsEval`` run followed by a ``cp`` of
    ``report.md`` to a section- and dataset-specific file name.
    """
    evaluation_root = '/opt/grobid/evaluation/'
    tmp_dir = "/opt/grobid/grobid-home/tmp/"
    datasets = ['PMC_sample_1943', 'PLOS_1000', 'biorxiv-10k-test-2000', 'eLife_984']

    # Report-name stems differ between sections for the eLife dataset
    # ("elife_984" vs "eLife_984"); both spellings are kept as they are.
    plain_stems = ['pmc_sample_1943', 'plos_1000', 'biorxiv-10k-test-2000', 'elife_984']
    flavor_stems = ['pmc_sample_1943', 'plos_1000', 'biorxiv-10k-test-2000', 'eLife_984']

    # (echo heading, -Pflavor value or None, report-name suffix, report stems)
    sections = [
        ("## standard process", None, "", plain_stems),
        ("## article/light", 'article/light', "-article_light", flavor_stems),
        ("## article/light-ref", 'article/light-ref', "-article_light_ref", flavor_stems),
    ]

    for heading, flavor, suffix, stems in sections:
        command_queue.put(["echo", heading])
        for dataset, stem in zip(datasets, stems):
            evaluation = [
                './gradlew',
                'jatsEval',
                '-Pp2t=' + evaluation_root + dataset,
                '-Prun=1',
                '-PfileRatio=1',
            ]
            if flavor is not None:
                evaluation.append('-Pflavor=' + flavor)
            command_queue.put(evaluation)
            command_queue.put(
                ["cp", tmp_dir + "report.md", tmp_dir + "report-" + stem + suffix + ".md"]
            )

    # The worker is started only after every command has been queued.
    threading.Thread(target=command_worker).start()
99
+
100
+
101
class Handler(http.server.SimpleHTTPRequestHandler):
    """HTTP front-end of the evaluation run.

    GET requests are routed as follows:

    * ``/fetch?filename=<name>`` returns the text of ``<name>`` located
      inside ``report_dir``.
    * The first request of any other kind starts the evaluation commands in
      a background thread.
    * Every later request returns the command output collected so far.
    """

    # Directory that /fetch is allowed to read from.
    report_dir = '/opt/grobid/grobid-home/tmp/'

    def do_GET(self):
        """Answer a GET request with a plain-text body (see the class docstring)."""
        global command_executed

        if self.path.startswith('/fetch?filename='):
            filename = urllib.parse.unquote(self.path.split('=', 1)[1])
            response = self._read_report(filename)
            if response is None:
                # An error response has already been sent.
                return

        elif not command_executed:
            command_executed = True
            threading.Thread(target=start_commands).start()
            response = "Starting evaluation."

        else:
            # Snapshot the queue's content without consuming it so that
            # later requests still see the complete output.
            collected = list(command_output.queue)
            response = "Command output:\n" + "".join(collected)

        self.send_response(200)
        self.send_header("Content-type", "text/plain")
        self.end_headers()
        self.wfile.write(response.encode())

    def _read_report(self, filename):
        """Return the text of *filename* from ``report_dir``.

        Args:
            filename: URL-decoded, client-supplied file name.

        Returns:
            The file content, or ``None`` after an error response (404 or
            500) has been sent to the client.
        """
        try:
            base = os.path.realpath(self.report_dir)
            # The name comes from the client: resolve "..", absolute paths
            # and symlinks, then require the result to stay inside the
            # report directory.
            target = os.path.realpath(os.path.join(base, filename))
            allowed = os.path.commonpath([base, target]) == base
            if allowed and os.path.isfile(target):
                # Decode as UTF-8 to match the encoding of the response body.
                with open(target, 'r', encoding='utf-8') as file:
                    return file.read()
        except (OSError, ValueError) as e:
            # The detail is passed as ``explain`` (response body) rather than
            # as the status-line message, since it may contain
            # client-controlled text.
            self.send_error(500, "An error occurred", explain=str(e))
            return None

        # The client-supplied name is deliberately not echoed in the status
        # line, where decoded control characters would end up in the header.
        self.send_error(404, "File not found.")
        return None
147
+
148
+
149
# Listen on every interface and answer requests until the process is stopped;
# leaving the ``with`` block closes the listening socket.
with socketserver.TCPServer(("", PORT), Handler) as server:
    print(f"Serving on port {PORT}")
    server.serve_forever()