Spaces:

muryshev
/

cb-api

Sleeping

File size: 2,260 Bytes

b7484d7
 
58a1c58
a5177da
b7484d7
 
 
8bed2ce
f908316
b7484d7
 
 
2d019a6
 
b7484d7
 
 
 
 
 
eeebb29
b7484d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
071a451
b7484d7
 
 
071a451
b7484d7
 
 
eeebb29
 
c01b75e
eeebb29
 
c01b75e
01f0b80
 
 
 
 
eeebb29
b7484d7
2d019a6

import os
import uuid
import json
from flask import Flask, request, jsonify, Response
import pytesseract
from pdf2image import convert_from_bytes
from flask_cors import CORS
from lib import ocr_2 as ocr
from lib import llm_3_deepinfra as llm

# Export the Tesseract data directory through the process environment.
# This happens at import time, before any request is served.
os.environ.update(TESSDATA_PREFIX='/usr/share/tesseract-ocr/5/tessdata')

# Directory in which uploaded PDFs are staged while they are processed.
UPLOAD_FOLDER = './tmp'

# Flask application object; CORS is applied to it via flask_cors.
app = Flask(__name__)
CORS(app)
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)

# Endpoint for uploading a PDF and extracting its text
@app.route('/recognize', methods=['POST'])
def upload_file():
    """Accept a PDF upload, run it through the OCR helper and return the result.

    Expects a POST request carrying the PDF under the ``file`` form field.
    The upload is written to ``app.config['UPLOAD_FOLDER']`` under a random
    UUID-based name (the client-supplied filename is never used in a path),
    handed to ``ocr.processSingleFile`` and deleted again afterwards, whether
    or not processing succeeded.

    Returns:
        A JSON response. On success it is whatever ``ocr.processSingleFile``
        returned, serialised with key order preserved and non-ASCII
        characters left unescaped. For a rejected upload it is
        ``{"error": <message>}``.

    NOTE(review): rejected uploads are still answered without an explicit
    status code, exactly as before, so clients that only inspect the
    ``error`` key keep working. Consider switching to 4xx codes once the
    callers have been checked.
    """
    # Check if the post request has the file part
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'})

    upload = request.files['file']

    # An empty filename means the form was submitted without choosing a file
    if upload.filename == '':
        return jsonify({'error': 'No selected file'})

    # The truthiness guard on the upload object is kept from the original so
    # that a missing filename is rejected instead of failing on .lower()
    # (presumably the object is falsy without a filename - verify against
    # Werkzeug). The extension check is case-insensitive so '.PDF' is accepted.
    if not (upload and upload.filename.lower().endswith('.pdf')):
        return jsonify({'error': 'File must be a PDF'})

    # Make sure the staging directory exists; saving would fail otherwise.
    upload_dir = app.config['UPLOAD_FOLDER']
    os.makedirs(upload_dir, exist_ok=True)
    temp_path = os.path.join(upload_dir, str(uuid.uuid4()) + '.pdf')

    try:
        upload.save(temp_path)
        docs_info = ocr.processSingleFile(temp_path)
    finally:
        # Always clean up the staged file, even when saving or OCR raised.
        # A missing file (save failed before creating it) is not an error
        # here and must not mask the original exception.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            pass

    return Response(
        json.dumps(docs_info, sort_keys=False, ensure_ascii=False),
        content_type='application/json; charset=utf-8',
    )

# Endpoint for analysing submitted text via lib.llm_3_deepinfra
@app.route('/analize', methods=['POST'])
async def analize():
    """Pass the posted text to ``llm.getApplicationInfo`` and return its result.

    Reads the ``text`` field of the JSON request body, awaits
    ``llm.getApplicationInfo`` with it, and responds with a JSON object that
    holds the result under ``"application"`` next to an empty ``"debug"``
    object. Key order is preserved and non-ASCII characters are left
    unescaped in the response body.
    """
    payload = request.json
    application = await llm.getApplicationInfo(payload.get('text'))

    body = json.dumps(
        {"application": application, "debug": {}},
        sort_keys=False,
        ensure_ascii=False,
    )
    return Response(body, content_type='application/json; charset=utf-8')
    
# Run the app with debug mode off, but only when this file is executed as a
# script; importing the module does not start it.
if __name__ == '__main__':
    app.run(debug=False)