import time
import signal
import sys

import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from IndicTransToolkit.processor import IndicProcessor
BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Set to "4-bit" or "8-bit" to enable bitsandbytes quantization; None loads full precision.
quantization = None
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """Load an IndicTrans2 checkpoint and its tokenizer, optionally quantized."""
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    if qconfig is None:
        model = model.to(DEVICE)
        # Use half precision only on CUDA; fp16 on CPU is slow or unsupported for many ops.
        # (DEVICE is only "cuda" when torch.cuda.is_available(), so no second check is needed.)
        if DEVICE == "cuda":
            model.half()

    model.eval()
    return tokenizer, model
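# Example (sketch): to load a checkpoint 4-bit quantized instead of full precision,
# one could call:
#   initialize_model_and_tokenizer("ai4bharat/indictrans2-en-indic-dist-200M", "4-bit")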
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]

        # Preprocess the batch and extract entity mappings
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the batch and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=4,
                num_return_sequences=1,
            )

        # Decode the generated tokens into text
        generated_tokens = tokenizer.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

        # Postprocess the translations, including entity replacement
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)

        del inputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return translations
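# Example (sketch): once the models below are loaded, translating two English
# sentences to Hindi would look like:
#   batch_translate(["Hello.", "How are you?"], "eng_Latn", "hin_Deva",
#                   en_indic_model, en_indic_tokenizer, ip)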
# en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"  # larger 1B checkpoint
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-dist-200M"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, quantization)

indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-dist-200M"
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, quantization)

ip = IndicProcessor(inference=True)
app = FastAPI()

class Translate(BaseModel):
    input_sentence: str
    source_lang: str
    target_lang: str
lang_list = [
    "eng_Latn",  # English (Latin script)
    "ben_Beng",  # Bengali
    "pan_Guru",  # Punjabi
    "asm_Beng",  # Assamese
    "gom_Deva",  # Konkani
    "guj_Gujr",  # Gujarati
    "hin_Deva",  # Hindi
    "kan_Knda",  # Kannada
    "mal_Mlym",  # Malayalam
    "ory_Orya",  # Odia
    "tam_Taml",  # Tamil
    "tel_Telu",  # Telugu
]
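# The codes above follow the FLORES-200 convention (language_Script) used by IndicTrans2.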
# POST endpoint to translate (route path "/translate" assumed from the handler's purpose;
# the original code was missing the decorator entirely)
@app.post("/translate")
def translate(input: Translate):
    start_time = time.time()

    if input.source_lang not in lang_list or input.target_lang not in lang_list:
        return {
            "message": "Not a valid language code",
            "translation": None,
        }

    # Pick the translation direction: Indic -> English or English -> Indic
    if input.target_lang == "eng_Latn":
        model = indic_en_model
        tokenizer = indic_en_tokenizer
    else:
        model = en_indic_model
        tokenizer = en_indic_tokenizer

    translation = batch_translate(
        [input.input_sentence],  # batch_translate expects a list
        src_lang=input.source_lang,
        tgt_lang=input.target_lang,
        model=model,
        tokenizer=tokenizer,
        ip=ip,
    )

    processing_time = round(time.time() - start_time, 2)
    return {
        "message": f"translation processed successfully in {processing_time} seconds",
        "translation": translation[0],
    }
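# Example request (hypothetical host/port, assuming the server runs locally on 9000):
#   curl -X POST http://localhost:9000/translate \
#     -H "Content-Type: application/json" \
#     -d '{"input_sentence": "How are you?", "source_lang": "eng_Latn", "target_lang": "hin_Deva"}'
# Expected response shape:
#   {"message": "translation processed successfully in ... seconds", "translation": "..."}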
# Health-check endpoint (route path "/health" assumed; the decorator was missing)
@app.get("/health")
def health_check():
    return {
        "status": "healthy",
        "gpu_available": torch.cuda.is_available(),
        "gpu_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
    }
# Signal handler for graceful shutdown
def handle_sigterm(signum, frame):
    print("Received SIGTERM signal. Cleaning up models and exiting...")
    # Delete model references to free GPU memory
    global en_indic_tokenizer, en_indic_model, indic_en_tokenizer, indic_en_model
    del en_indic_tokenizer, en_indic_model
    del indic_en_tokenizer, indic_en_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    sys.exit(0)

# Register the signal handler
signal.signal(signal.SIGTERM, handle_sigterm)
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=9000)
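# Alternatively, run via the uvicorn CLI (assuming this file is saved as app.py):
#   uvicorn app:app --host 0.0.0.0 --port 9000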