Spaces:
Configuration error
Configuration error
File size: 10,414 Bytes
98d3f44 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 |
from twisted.web.static import File
from twisted.web.resource import Resource
from twisted.web.server import Site, NOT_DONE_YET
from twisted.internet import reactor, threads
from twisted.web._responses import FOUND
import json
import logging
import multiprocessing
import os
import shutil
import uuid
import wave
from gentle.util.paths import get_resource, get_datadir
from gentle.util.cyst import Insist
import gentle
class TranscriptionStatus(Resource):
    """Web resource that reports one job's status dict as JSON.

    The dict is held by reference, so every GET serializes whatever the
    dict contains at that moment.
    """

    def __init__(self, status_dict):
        """Remember ``status_dict`` and initialise the base resource."""
        self.status_dict = status_dict
        Resource.__init__(self)

    def render_GET(self, req):
        """Return the current status dict, JSON-encoded, as the response body."""
        req.setHeader(b"Content-Type", "application/json")
        payload = json.dumps(self.status_dict)
        return payload.encode()
class Transcriber():
    """Run alignment / transcription jobs and track their progress.

    Every job is identified by a short ``uid``. Its files are kept under
    ``<data_dir>/transcriptions/<uid>`` and its progress is published
    through a per-job status dict (see ``get_status``).
    """

    def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
        """Load the gentle resources and prepare the shared full transcriber.

        data_dir: root directory under which job directories are stored.
        nthreads: thread count handed to each ``gentle.ForcedAligner``.
        ntranscriptionthreads: thread count handed to ``gentle.FullTranscriber``.
        """
        self.data_dir = data_dir
        self.nthreads = nthreads
        self.ntranscriptionthreads = ntranscriptionthreads
        self.resources = gentle.Resources()
        self.full_transcriber = gentle.FullTranscriber(self.resources, nthreads=ntranscriptionthreads)
        self._status_dicts = {}

    def get_status(self, uid):
        """Return the mutable status dict for ``uid``, creating an empty one if needed."""
        return self._status_dicts.setdefault(uid, {})

    def out_dir(self, uid):
        """Return the directory that holds the files of job ``uid``."""
        return os.path.join(self.data_dir, 'transcriptions', uid)

    # TODO(maxhawkins): refactor so this is returned by transcribe()
    def next_id(self):
        """Return a random 8-hex-digit uid whose job directory does not exist yet."""
        uid = None
        # Collisions must be checked against the job directory itself
        # (<data_dir>/transcriptions/<uid>), which is where jobs are created.
        while uid is None or os.path.exists(self.out_dir(uid)):
            uid = uuid.uuid4().hex[:8]
        return uid

    def transcribe(self, uid, transcript, audio, async_mode, **kwargs):
        """Align ``transcript`` to ``audio`` (or fully transcribe when it is blank).

        The job directory for ``uid`` must already exist. Progress is
        reported through the job's status dict, whose ``'status'`` moves
        through ``STARTED``, ``ENCODING``, ``TRANSCRIBING`` and finally
        ``OK`` or ``ERROR`` (with an ``'error'`` message).

        uid: job identifier.
        transcript: transcript text; blank text selects full transcription.
        audio: raw bytes of the uploaded media file.
        async_mode: accepted for the caller's convenience; not used here.
        kwargs: extra keyword arguments forwarded to ``gentle.ForcedAligner``.

        Returns the object produced by the transcriber's ``transcribe`` call,
        or ``None`` when the job failed (see the status dict for the reason).
        """
        status = self.get_status(uid)

        status['status'] = 'STARTED'

        outdir = self.out_dir(uid)
        upload_path = os.path.join(outdir, 'upload')

        with open(os.path.join(outdir, 'transcript.txt'), 'w') as tranfile:
            tranfile.write(transcript)
        with open(upload_path, 'wb') as uploadfile:
            uploadfile.write(audio)

        status['status'] = 'ENCODING'

        wavfile = os.path.join(outdir, 'a.wav')
        if gentle.resample(upload_path, wavfile) != 0:
            status['status'] = 'ERROR'
            status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        #XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        # The handle is only needed to compute the duration, so close it
        # right away instead of leaking it for the lifetime of the job.
        with wave.open(wavfile, 'rb') as wav_obj:
            status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            """Copy every progress field reported by the transcriber into the status dict."""
            print(p)
            status.update(p)

        if len(transcript.strip()) > 0:
            trans = gentle.ForcedAligner(self.resources, transcript, nthreads=self.nthreads, **kwargs)
        elif self.full_transcriber.available:
            trans = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status['error'] = 'No transcript provided and no language model for full transcription'
            return

        output = trans.transcribe(wavfile, progress_cb=on_progress, logging=logging)

        # ...remove the original upload
        os.unlink(upload_path)

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            jsfile.write(output.to_json(indent=2))
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(output.to_csv())

        # Inline the alignment into the index.html file.
        # NOTE(review): the JSON is spliced into the page verbatim; if the
        # placeholder sits inside a <script> element, transcript text such as
        # "</script>" could break out of it -- confirm and escape if so.
        with open(get_resource('www/view_alignment.html')) as template:
            htmltxt = template.read()
        htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (output.to_json()))
        with open(os.path.join(outdir, 'index.html'), 'w') as htmlfile:
            htmlfile.write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output
class TranscriptionsController(Resource):
    """Resource that accepts new jobs (POST) and exposes each job's directory."""

    def __init__(self, transcriber):
        """Keep a reference to the ``transcriber`` that runs and tracks the jobs."""
        Resource.__init__(self)
        self.transcriber = transcriber

    def getChild(self, uid, req):
        """Return a ``File`` resource for job ``uid`` with a live ``status.json`` child."""
        uid = uid.decode()
        out_dir = self.transcriber.out_dir(uid)
        trans_ctrl = File(out_dir)

        # Add a Status endpoint to the file
        trans_status = TranscriptionStatus(self.transcriber.get_status(uid))
        trans_ctrl.putChild(b"status.json", trans_status)

        return trans_ctrl

    def render_POST(self, req):
        """Create a job from the posted form and start it on the reactor thread pool.

        Form fields read from ``req.args``:
          audio        -- the media upload (required; 400 when missing)
          transcript   -- transcript text (optional, defaults to empty)
          disfluency   -- presence enables the ``disfluency`` option
          conservative -- presence enables the ``conservative`` option
          async        -- the value ``false`` selects synchronous mode

        In asynchronous mode (the default) the client is redirected to the
        job page straight away. In synchronous mode the response is held
        open until the job ends and then carries the alignment JSON, or a
        500 response with a JSON error description when the job failed.
        """
        audio_args = req.args.get(b'audio')
        if not audio_args:
            # Answer with a client error rather than letting a KeyError
            # surface as an internal server error.
            req.setResponseCode(400)
            req.setHeader(b"Content-Type", b"text/plain")
            return b"Missing required 'audio' upload."
        audio = audio_args[0]

        uid = self.transcriber.next_id()

        tran = req.args.get(b'transcript', [b''])[0].decode()

        kwargs = {'disfluency': b'disfluency' in req.args,
                  'conservative': b'conservative' in req.args,
                  'disfluencies': {'uh', 'um'}}

        async_mode = req.args.get(b'async', [b''])[0] != b'false'

        # We need to make the transcription directory here, so that
        # when we redirect the user we are sure that there's a place
        # for them to go.
        outdir = self.transcriber.out_dir(uid)
        os.makedirs(outdir)

        # Copy over the HTML
        shutil.copy(get_resource('www/view_alignment.html'), os.path.join(outdir, 'index.html'))

        result_promise = threads.deferToThreadPool(
            reactor, reactor.getThreadPool(),
            self.transcriber.transcribe,
            uid, tran, audio, async_mode, **kwargs)

        if not async_mode:
            # Non-empty once the client has disconnected; nothing may be
            # written to the request after that.
            client_gone = []

            def send_json(body, code=None):
                '''Write a JSON body to the client and finish the request'''
                if code is not None:
                    req.setResponseCode(code)
                req.setHeader(b"Content-Type", b"application/json")
                req.write(body.encode())
                req.finish()

            def write_result(result):
                '''Write JSON to client on completion'''
                if client_gone:
                    return
                if result is None:
                    # transcribe() returns None on failure and records the
                    # reason in the job's status dict.
                    send_json(json.dumps(self.transcriber.get_status(uid)), 500)
                else:
                    send_json(result.to_json(indent=2))

            def write_failure(failure):
                '''Report an unexpected error instead of leaving the client hanging'''
                if client_gone:
                    return
                logging.error('transcription %s failed: %s', uid, failure.getErrorMessage())
                send_json(json.dumps({'status': 'ERROR',
                                      'error': failure.getErrorMessage()}), 500)

            def on_disconnect(_):
                '''Stop waiting for the job once the client has gone away'''
                client_gone.append(True)
                result_promise.cancel()

            result_promise.addCallback(write_result)
            result_promise.addErrback(write_failure)

            req.notifyFinish().addErrback(on_disconnect)
            return NOT_DONE_YET

        req.setResponseCode(FOUND)
        req.setHeader(b"Location", "/transcriptions/%s" % (uid))
        return b''
class LazyZipper(Insist):
    """Zip archive of one job directory, stored as ``<cachedir>/<uid>.zip``.

    NOTE(review): presumably ``Insist`` calls ``serialize_computation`` to
    produce the file when it is first needed -- confirm in gentle.util.cyst.
    """

    def __init__(self, cachedir, transcriber, uid):
        """Record the job to archive and register the target zip path with ``Insist``."""
        self.transcriber = transcriber
        self.uid = uid
        zip_path = os.path.join(cachedir, '%s.zip' % (uid))
        Insist.__init__(self, zip_path)

    def serialize_computation(self, outpath):
        """Archive the job's output directory so that the zip lands at ``outpath``."""
        # `shutil.make_archive` appends ".zip" itself, so hand it the path
        # with the final extension removed.
        archive_base = outpath.rpartition('.')[0]
        source_dir = os.path.join(self.transcriber.out_dir(self.uid))
        shutil.make_archive(archive_base, "zip", source_dir)
class TranscriptionZipper(Resource):
    """Resource that hands out ``<uid>.zip`` archives of job directories."""

    def __init__(self, cachedir, transcriber):
        """Remember where archives are cached and which transcriber owns the jobs."""
        self.cachedir = cachedir
        self.transcriber = transcriber
        Resource.__init__(self)

    def getChild(self, path, req):
        """Return a ``LazyZipper`` for the job named by ``path``.

        The uid is the part of ``path`` before the first dot. When the uid
        is empty or the job directory does not exist, the lookup falls back
        to ``Resource.getChild``.
        """
        uid = path.decode().split('.')[0]
        t_dir = self.transcriber.out_dir(uid)
        # An empty uid (e.g. "/zip/" or "/zip/.zip") would resolve to the
        # parent directory of *all* jobs, so it must never be archived.
        if uid and os.path.exists(t_dir):
            # TODO: Check that "status" is complete and only create a LazyZipper if so
            # Otherwise, we could have incomplete transcriptions that get permanently zipped.
            # For now, a solution will be hiding the button in the client until it's done.
            lz = LazyZipper(self.cachedir, self.transcriber, uid)
            # Register the archive as a child so it is reused for this path.
            self.putChild(path, lz)
            return lz
        return Resource.getChild(self, path, req)
def serve(port=8765, interface='0.0.0.0', installSignalHandlers=0, nthreads=4, ntranscriptionthreads=2, data_dir=get_datadir('webdata')):
    """Build the resource tree and run the Twisted reactor.

    port, interface: address the HTTP site listens on.
    installSignalHandlers: forwarded to ``reactor.run``.
    nthreads, ntranscriptionthreads: forwarded to ``Transcriber``.
    data_dir: directory served at the site root; also holds the job
        directories and the ``zip`` cache. Created when missing.
    """
    logging.info("SERVE %d, %s, %d", port, interface, installSignalHandlers)

    # Make sure the data directory and its zip cache exist.
    zip_dir = os.path.join(data_dir, 'zip')
    for needed_dir in (data_dir, zip_dir):
        if not os.path.exists(needed_dir):
            os.makedirs(needed_dir)

    root = File(data_dir)

    # Static pages bundled with the package.
    static_children = (
        (b'', 'www/index.html'),
        (b'status.html', 'www/status.html'),
        (b'preloader.gif', 'www/preloader.gif'),
    )
    for child_name, resource_name in static_children:
        root.putChild(child_name, File(get_resource(resource_name)))

    # Dynamic endpoints: job submission/results and zip downloads.
    transcriber = Transcriber(data_dir, nthreads=nthreads, ntranscriptionthreads=ntranscriptionthreads)
    root.putChild(b'transcriptions', TranscriptionsController(transcriber))
    root.putChild(b'zip', TranscriptionZipper(zip_dir, transcriber))

    site = Site(root)
    logging.info("about to listen")
    reactor.listenTCP(port, site, interface=interface)
    logging.info("listening")

    reactor.run(installSignalHandlers=installSignalHandlers)
if __name__=='__main__':
    # Command-line entry point: parse the options, set the log level, serve.
    import argparse

    cli = argparse.ArgumentParser(
        description='Align a transcript to audio by generating a new language model.')
    cli.add_argument(
        '--host', default="0.0.0.0",
        help='host to run http server on')
    cli.add_argument(
        '--port', default=8765, type=int,
        help='port number to run http server on')
    cli.add_argument(
        '--nthreads', default=multiprocessing.cpu_count(), type=int,
        help='number of alignment threads')
    cli.add_argument(
        '--ntranscriptionthreads', default=2, type=int,
        help='number of full-transcription threads (memory intensive)')
    cli.add_argument(
        '--log', default="INFO",
        help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)')

    opts = cli.parse_args()

    # The root logger accepts the level by its upper-case name.
    logging.getLogger().setLevel(opts.log.upper())

    logging.info('gentle %s', gentle.__version__)
    logging.info('listening at %s:%d\n', opts.host, opts.port)

    serve(
        opts.port,
        opts.host,
        nthreads=opts.nthreads,
        ntranscriptionthreads=opts.ntranscriptionthreads,
        installSignalHandlers=1,
    )
|