# cb-api / app.py
# Commit c01b75e by muryshev: "async fixes"
import os
import uuid
import json
from flask import Flask, request, jsonify, Response
import pytesseract
from pdf2image import convert_from_bytes
from flask_cors import CORS
from lib import ocr_2 as ocr
from lib import llm_3_deepinfra as llm
# Point Tesseract at its language-data directory (path is install-specific).
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'

# Directory where uploaded PDFs are written while they are being processed.
UPLOAD_FOLDER = './tmp'

# Flask application with flask-cors applied using its default settings.
app = Flask(__name__)
CORS(app)
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)
@app.route('/recognize', methods=['POST'])
def upload_file():
    """Accept an uploaded PDF, run OCR on it, and return the result as JSON.

    Expects a multipart form upload with the PDF in the ``file`` field. The
    upload is saved under ``app.config['UPLOAD_FOLDER']`` with a random UUID
    name, handed to ``ocr.processSingleFile``, and deleted afterwards.

    Returns:
        A UTF-8 JSON response containing whatever ``ocr.processSingleFile``
        returned, or a JSON object with an ``error`` key when the upload is
        missing, has an empty filename, or is not a ``.pdf`` file.
    """
    # NOTE(review): the error responses below keep the default 200 status so
    # existing clients are unaffected; consider returning 400 instead.
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'})

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'})

    # Compare case-insensitively so names such as "SCAN.PDF" are accepted.
    if not (file and file.filename.lower().endswith('.pdf')):
        return jsonify({'error': 'File must be a PDF'})

    upload_dir = app.config['UPLOAD_FOLDER']
    # Saving fails if the target directory is missing, so create it on demand.
    os.makedirs(upload_dir, exist_ok=True)
    # A random name keeps concurrent uploads from overwriting each other.
    temp_path = os.path.join(upload_dir, str(uuid.uuid4()) + '.pdf')

    try:
        file.save(temp_path)
        docs_info = ocr.processSingleFile(temp_path)
    finally:
        # Always clean up the temp file, even when saving or OCR raises.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    return Response(
        json.dumps(docs_info, sort_keys=False, ensure_ascii=False),
        content_type='application/json; charset=utf-8',
    )
@app.route('/analize', methods=['POST'])
async def analize():
    """Extract application info from submitted text via the LLM helper.

    Expects a JSON object body; the value of its ``text`` field is passed to
    ``llm.getApplicationInfo``.

    Returns:
        A UTF-8 JSON response of the form
        ``{"application": <LLM result>, "debug": {}}``, or a 400 JSON response
        with an ``error`` key when the body is not a JSON object.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON
    # body, so malformed requests get a clear error rather than a 500.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    # NOTE(review): a missing "text" key is forwarded as None, as before;
    # confirm llm.getApplicationInfo handles that.
    text_data = payload.get('text')
    app_info = await llm.getApplicationInfo(text_data)

    result = {
        "application": app_info,
        "debug": {},
    }
    return Response(
        json.dumps(result, sort_keys=False, ensure_ascii=False),
        content_type='application/json; charset=utf-8',
    )
# Script entry point: start Flask's built-in server with debug mode off.
if __name__ == "__main__":
    app.run(debug=False)