"""Flask service exposing PDF recognition (OCR) and text analysis (LLM) endpoints."""

import os
import uuid
import json

from flask import Flask, request, jsonify, Response
import pytesseract
from pdf2image import convert_from_bytes
from flask_cors import CORS

from lib import ocr_2 as ocr
from lib import llm_3_deepinfra as llm

# Point Tesseract at its language data directory.
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'

app = Flask(__name__)
CORS(app)

# Directory where uploaded PDFs are stored while they are being processed.
UPLOAD_FOLDER = './tmp'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER


def _json_response(payload):
    """Serialize *payload* to a UTF-8 JSON response.

    Key order is preserved and non-ASCII characters are emitted as-is
    (``ensure_ascii=False``) instead of being escaped.
    """
    return Response(
        json.dumps(payload, sort_keys=False, ensure_ascii=False),
        content_type='application/json; charset=utf-8',
    )


# Endpoint for uploading a PDF and extracting its contents via OCR
@app.route('/recognize', methods=['POST'])
def upload_file():
    """Handle a PDF upload and return the result of ``ocr.processSingleFile``.

    Expects a multipart form field named ``file``. The upload is written to
    ``UPLOAD_FOLDER`` under a random UUID-based name, passed to the OCR
    module by path, and removed afterwards.

    Returns:
        A JSON response with whatever ``ocr.processSingleFile`` returned, or
        a JSON object ``{"error": ...}`` when the request carries no file,
        an empty filename, or a file whose name does not end in ``.pdf``.
        Error responses keep the default status code, as before.
    """
    # Check if the post request has the file part
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'})

    file = request.files['file']

    # An empty filename means the form was submitted without choosing a file
    if file.filename == '':
        return jsonify({'error': 'No selected file'})

    # Compare case-insensitively so that e.g. "SCAN.PDF" is accepted too
    if not (file and file.filename.lower().endswith('.pdf')):
        return jsonify({'error': 'File must be a PDF'})

    upload_dir = app.config['UPLOAD_FOLDER']
    # The save below fails if the directory is missing, so create it on demand
    os.makedirs(upload_dir, exist_ok=True)

    # A random name avoids collisions between concurrent uploads and keeps
    # the client-supplied filename out of the filesystem path
    temp_path = os.path.join(upload_dir, str(uuid.uuid4()) + '.pdf')
    file.save(temp_path)

    try:
        docs_info = ocr.processSingleFile(temp_path)
    finally:
        # Always clean up the temporary file, even when OCR raises
        os.remove(temp_path)

    return _json_response(docs_info)


# Endpoint for analyzing previously extracted text with the LLM
@app.route('/analize', methods=['POST'])
async def analize():
    """Run the LLM over the text supplied in the request body.

    Reads the ``text`` field of the JSON request body and awaits
    ``llm.getApplicationInfo`` on it.

    Returns:
        A JSON response of the form ``{"application": <llm result>, "debug": {}}``.
    """
    # NOTE(review): a missing "text" key yields None here and is passed to the
    # LLM helper unchanged — confirm that the helper tolerates it.
    text_data = request.json.get('text')
    app_info = await llm.getApplicationInfo(text_data)

    result = {
        "application": app_info,
        "debug": {}
    }
    return _json_response(result)


if __name__ == '__main__':
    app.run(debug=False)