Spaces:
Sleeping
Sleeping
File size: 2,874 Bytes
d120873 37ded96 d120873 d22cb09 d120873 d22cb09 95b5309 d22cb09 b8db721 95b5309 49ae858 d22cb09 49ae858 95b5309 d22cb09 37ded96 2e5046d 088720e e7d7478 088720e 2e5046d e7d7478 088720e e7d7478 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import src.exception.Exception.Exception as ExceptionCustom
# Use a pipeline as a high-level helper
from transformers import pipeline
# Method tag handed to ExceptionCustom.checkForException together with the
# request text when paraphraseTranslateMethod validates its input.
METHOD = "TRANSLATE"
def paraphraseTranslateMethod(requestValue: str, model: str):
    """Translate ``requestValue`` one sentence at a time.

    Args:
        requestValue: Text to translate. It is first validated with
            ``ExceptionCustom.checkForException`` and then split into
            sentences with ``sent_tokenize``.
        model: Direction selector. ``'roen'`` uses the
            ``BlackKakapo/opus-mt-ro-en`` checkpoint; any other value uses
            ``BlackKakapo/opus-mt-en-ro``.

    Returns:
        A 2-tuple. If validation reports a problem the result is
        ``("", exception)``. Otherwise it is the decoded sentences joined
        with single spaces (stripped), followed by the unchanged ``model``
        argument.
    """
    exception = ExceptionCustom.checkForException(requestValue, METHOD)
    if exception:
        return "", exception

    tokenized_sent_list = sent_tokenize(requestValue)
    if not tokenized_sent_list:
        # Nothing to translate: same result as joining an empty list, but
        # without paying for a checkpoint load.
        return "", model

    # The checkpoint depends only on ``model``, so resolve and load it once
    # instead of re-loading tokenizer and weights for every sentence.
    checkpoint = (
        "BlackKakapo/opus-mt-ro-en" if model == 'roen'
        else "BlackKakapo/opus-mt-en-ro"
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    translator = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    translator.to(device)

    result_value = []
    for sentence in tokenized_sent_list:
        encoded = tokenizer(sentence, return_tensors='pt').to(device)
        output = translator.generate(
            input_ids=encoded.input_ids,
            do_sample=True,  # sampling is enabled, so output can vary per call
            max_length=512,
            top_k=90,
            top_p=0.97,
            early_stopping=False,
        )
        # One input sequence per call, so the first decoded entry is the
        # translation of this sentence.
        result_value.append(
            tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        )
    return " ".join(result_value).strip(), model
def gemma(requestValue: str, model: str = 'Gargaz/gemma-2b-romanian-better'):
    """Ask a text-generation pipeline for a formal Romanian translation.

    Args:
        requestValue: Text embedded in the single user chat message sent to
            the pipeline.
        model: Model identifier passed to ``pipeline``. A value that does not
            contain ``'/'`` is replaced by
            ``'Gargaz/gemma-2b-romanian-better'``.

    Returns:
        A 2-tuple ``(output, model)``: the value returned by the pipeline
        call, unmodified, and the model identifier that was actually used.
    """
    if '/' not in model:
        # Anything that is not a "namespace/name" style id falls back to the
        # default checkpoint.
        model = 'Gargaz/gemma-2b-romanian-better'

    messages = [{"role": "user", "content": f"Translate this text to Romanian using a formal tone. Only return the translated text: {requestValue}"}]

    pipe = pipeline(
        "text-generation",
        model=model,
        device=-1,  # -1 selects the CPU for transformers pipelines
        max_new_tokens=256,  # Keep short to reduce verbosity
        do_sample=False,  # Use greedy decoding for determinism
    )
    output = pipe(messages, num_return_sequences=1, return_full_text=False)
    # NOTE(review): the raw pipeline output is returned, not the extracted
    # text; callers presumably read output[0]["generated_text"] themselves.
    # Confirm before changing the return shape.
    return output, model