Spaces:

parthgajera
/

description

Sleeping

File size: 34,962 Bytes

# import os
# import nltk
# import asyncio
# import torch
# import logging
# from nltk.tokenize import sent_tokenize
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# from typing import List

# # Configure logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # Load optional secret key (e.g., for logging/monitoring access)
# API_KEY = os.getenv("API_KEY")
# if API_KEY:
#     logger.info("API_KEY loaded successfully.")
# else:
#     logger.warning("API_KEY not found. You may set it via Hugging Face secrets.")

# # NLTK setup
# nltk_data_path = os.getenv("NLTK_DATA", "/app/nltk_data")
# nltk.data.path.append(nltk_data_path)

# # Download required tokenizer
# try:
#     nltk.data.find("tokenizers/punkt")
# except LookupError:
#     nltk.download("punkt", download_dir=nltk_data_path)

# # Load Pegasus model and tokenizer
# try:
#     logger.info("Loading Pegasus model from /app/pegasus_model...")
#     pegasus_model = PegasusForConditionalGeneration.from_pretrained("/app/pegasus_model")
#     tokenizer = PegasusTokenizer.from_pretrained("/app/pegasus_model")
#     logger.info("Pegasus model loaded successfully.")
# except Exception as e:
#     logger.error(f"Error loading Pegasus model: {e}")
#     raise

# # Generation config
# MAX_TOKENS = 1024
# TEMPERATURE = 0.9
# TOP_K = 50
# TOP_P = 0.95
# NUM_BEAMS = 3

# def split_into_sentences(text: str) -> List[str]:
#     """Split text into sentences while preserving paragraph breaks."""
#     sentences = []
#     for paragraph in text.split('\n'):
#         if paragraph.strip():
#             sentences.extend(sent_tokenize(paragraph))
#         else:
#             sentences.append('')  # preserve empty lines
#     return sentences

# async def paraphrase_sentence(sentence: str) -> str:
#     """Paraphrase a single sentence using Pegasus."""
#     if not sentence.strip():
#         return sentence
#     try:
#         inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
#         outputs = pegasus_model.generate(
#             **inputs,
#             max_length=MAX_TOKENS,
#             num_beams=NUM_BEAMS,
#             early_stopping=True,
#             temperature=TEMPERATURE,
#             top_k=TOP_K,
#             top_p=TOP_P,
#             do_sample=True
#         )
#         paraphrased = tokenizer.decode(outputs[0], skip_special_tokens=True)

#         # Ensure meaning is preserved (not too short, not identical)
#         if paraphrased.lower() != sentence.lower() and len(paraphrased.split()) >= len(sentence.split()) * 0.7:
#             return paraphrased
#     except Exception as e:
#         logger.error(f"Failed to paraphrase sentence: {e}")
#     return sentence

# async def paraphrase_paragraph(paragraph: str) -> str:
#     """Paraphrase each sentence within a paragraph."""
#     if not paragraph.strip():
#         return paragraph
#     sentences = sent_tokenize(paragraph)
#     paraphrased_sentences = await asyncio.gather(*[paraphrase_sentence(s) for s in sentences])
#     return ' '.join(paraphrased_sentences)

# async def get_paraphrased_text(text: str) -> str:
#     """Main interface: paraphrase a long multi-paragraph text."""
#     if not text.strip():
#         return text
#     paragraphs = text.split('\n')
#     paraphrased_paragraphs = await asyncio.gather(*[paraphrase_paragraph(p) for p in paragraphs])
#     return '\n'.join(paraphrased_paragraphs)




###-------------- working properly! -----------------------

# import os
# import nltk
# import asyncio
# import torch
# import logging
# from nltk.tokenize import sent_tokenize
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# from typing import List

# # Configure logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # Optional: Hugging Face secrets
# API_KEY = os.getenv("API_KEY")
# if API_KEY:
#     logger.info("API_KEY loaded successfully.")
# else:
#     logger.warning("API_KEY not found. You may set it via Hugging Face secrets.")

# # NLTK setup
# nltk_data_path = os.getenv("NLTK_DATA", "/app/nltk_data")
# nltk.data.path.append(nltk_data_path)

# try:
#     nltk.data.find("tokenizers/punkt")
# except LookupError:
#     nltk.download("punkt", download_dir=nltk_data_path)

# # Load model on CPU with optimizations
# torch_device = "cpu"
# model_name = "tuner007/pegasus_paraphrase"

# try:
#     logger.info(f"Loading Pegasus model '{model_name}' on CPU...")
#     tokenizer = PegasusTokenizer.from_pretrained(model_name)
#     pegasus_model = PegasusForConditionalGeneration.from_pretrained(
#         model_name,
#         torch_dtype=torch.float32,
#         low_cpu_mem_usage=True
#     ).to(torch_device).eval()
#     logger.info("Model loaded successfully.")
# except Exception as e:
#     logger.error(f"Error loading model: {e}")
#     raise

# # Generation config
# MAX_TOKENS = 1024
# NUM_BEAMS = 3
# TEMPERATURE = 1.0
# TOP_K = 50
# TOP_P = 0.95

# def split_into_sentences(text: str) -> List[str]:
#     """Split text into sentences while preserving paragraph breaks."""
#     sentences = []
#     for paragraph in text.split('\n'):
#         if paragraph.strip():
#             sentences.extend(sent_tokenize(paragraph))
#         else:
#             sentences.append('')  # preserve empty lines
#     return sentences

# async def paraphrase_sentence(sentence: str) -> str:
#     """Paraphrase a single sentence using Pegasus."""
#     if not sentence.strip():
#         return sentence
#     try:
#         inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(torch_device)
#         outputs = pegasus_model.generate(
#             **inputs,
#             max_length=MAX_TOKENS,
#             num_beams=NUM_BEAMS,
#             early_stopping=True,
#             do_sample=False,
#             temperature=TEMPERATURE,
#             top_k=TOP_K,
#             top_p=TOP_P
#         )
#         paraphrased = tokenizer.decode(outputs[0], skip_special_tokens=True)

#         # Filter out poor-quality paraphrases
#         if paraphrased.lower() != sentence.lower() and len(paraphrased.split()) >= len(sentence.split()) * 0.7:
#             return paraphrased
#     except Exception as e:
#         logger.error(f"Failed to paraphrase sentence: {e}")
#     return sentence

# async def paraphrase_paragraph(paragraph: str) -> str:
#     """Paraphrase each sentence within a paragraph."""
#     if not paragraph.strip():
#         return paragraph
#     sentences = sent_tokenize(paragraph)
#     paraphrased_sentences = await asyncio.gather(*[paraphrase_sentence(s) for s in sentences])
#     return ' '.join(paraphrased_sentences)

# async def get_paraphrased_text(text: str) -> str:
#     """Main interface: paraphrase a long multi-paragraph text."""
#     if not text.strip():
#         return text
#     paragraphs = text.split('\n')
#     paraphrased_paragraphs = await asyncio.gather(*[paraphrase_paragraph(p) for p in paragraphs])
#     return '\n'.join(paraphrased_paragraphs)




##### update #####

# import os
# import nltk
# import asyncio
# import torch
# import logging
# from typing import List
# from nltk.tokenize import sent_tokenize
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# # Setup logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # Load optional API key (for HF Spaces secrets if used)
# API_KEY = os.getenv("API_KEY")
# if API_KEY:
#     logger.info("API_KEY loaded successfully.")
# else:
#     logger.warning("API_KEY not found. Continuing without it.")

# # Ensure NLTK data is available
# nltk_data_path = os.getenv("NLTK_DATA", "/app/nltk_data")
# nltk.data.path.append(nltk_data_path)
# try:
#     nltk.data.find("tokenizers/punkt")
# except LookupError:
#     nltk.download("punkt", download_dir=nltk_data_path)

# # Model setup
# MAX_TOKENS = 128  # lower max length for faster response
# MAX_INPUT_LENGTH = 60
# NUM_BEAMS = 3
# torch_device = "cpu"
# model_name = "tuner007/pegasus_paraphrase"

# logger.info(f"Loading model '{model_name}'...")
# tokenizer = PegasusTokenizer.from_pretrained(model_name)
# model = PegasusForConditionalGeneration.from_pretrained(
#     model_name,
#     torch_dtype=torch.float32,
#     low_cpu_mem_usage=True
# ).to(torch_device).eval()

# # --- Utilities ---

# def split_into_sentences(text: str) -> List[str]:
#     """Preserve paragraph structure while splitting into sentences."""
#     sentences = []
#     for para in text.split('\n'):
#         if para.strip():
#             sentences.extend(sent_tokenize(para))
#         else:
#             sentences.append('')  # blank line = paragraph break
#     return sentences

# def chunk_sentence(sentence: str, max_words: int = 50) -> List[str]:
#     """Break long sentence into smaller chunks."""
#     words = sentence.split()
#     if len(words) <= max_words:
#         return [sentence]
#     return [' '.join(words[i:i+max_words]) for i in range(0, len(words), max_words)]

# # --- Paraphrasing ---

# async def paraphrase_sentence(sentence: str) -> str:
#     """Paraphrase a single sentence or chunk."""
#     if not sentence.strip():
#         return sentence

#     chunks = chunk_sentence(sentence)
#     rewritten_chunks = []

#     for chunk in chunks:
#         try:
#             inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LENGTH).to(torch_device)

#             if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
#                 logger.warning("Chunk too long, skipping.")
#                 rewritten_chunks.append(chunk)
#                 continue

#             outputs = model.generate(
#                 **inputs,
#                 max_length=MAX_TOKENS,
#                 num_beams=NUM_BEAMS,
#                 early_stopping=True,
#                 do_sample=False,
#             )
#             result = tokenizer.decode(outputs[0], skip_special_tokens=True)

#             if result.lower() != chunk.lower() and len(result.split()) >= len(chunk.split()) * 0.7:
#                 rewritten_chunks.append(result)
#             else:
#                 rewritten_chunks.append(chunk)

#         except Exception as e:
#             logger.error(f"Error during paraphrase: {e}")
#             rewritten_chunks.append(chunk)

#     return ' '.join(rewritten_chunks)

# async def paraphrase_paragraph(paragraph: str) -> str:
#     """Process each sentence in a paragraph."""
#     if not paragraph.strip():
#         return paragraph
#     sentences = sent_tokenize(paragraph)
#     rewritten = await asyncio.gather(*(paraphrase_sentence(s) for s in sentences))
#     return ' '.join(rewritten)

# async def get_paraphrased_text(text: str) -> str:
#     """Main method to rewrite input while preserving structure."""
#     if not text.strip():
#         return text
#     paragraphs = text.split('\n')
#     rewritten = await asyncio.gather(*(paraphrase_paragraph(p) for p in paragraphs))
#     return '\n'.join(rewritten)



# import os
# import nltk
# import asyncio
# import torch
# import logging
# from typing import List
# from nltk.tokenize import sent_tokenize
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# # Setup logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # Optional API key (e.g., for Hugging Face secrets)
# API_KEY = os.getenv("API_KEY")
# if API_KEY:
#     logger.info("API_KEY loaded successfully.")
# else:
#     logger.warning("API_KEY not found. Continuing without it.")

# # Ensure NLTK tokenizer is available
# nltk_data_path = os.getenv("NLTK_DATA", "/app/nltk_data")
# nltk.data.path.append(nltk_data_path)
# try:
#     nltk.data.find("tokenizers/punkt")
# except LookupError:
#     nltk.download("punkt", download_dir=nltk_data_path)

# # Model configuration
# MAX_TOKENS = 128  # Max output length
# MAX_INPUT_LENGTH = 60  # Max input token length per chunk
# NUM_BEAMS = 3
# torch_device = "cpu"
# model_name = "tuner007/pegasus_paraphrase"

# logger.info(f"Loading model '{model_name}'...")
# tokenizer = PegasusTokenizer.from_pretrained(model_name)
# model = PegasusForConditionalGeneration.from_pretrained(
#     model_name,
#     torch_dtype=torch.float32,
#     low_cpu_mem_usage=True
# ).to(torch_device).eval()


# # ----------- Utilities -----------

# def split_into_sentences(text: str) -> List[str]:
#     """Preserve paragraph breaks while tokenizing into sentences."""
#     sentences = []
#     for para in text.split('\n'):
#         if para.strip():
#             sentences.extend(sent_tokenize(para))
#         else:
#             sentences.append('')  # preserve paragraph spacing
#     return sentences

# def chunk_sentence(sentence: str, max_words: int = 50) -> List[str]:
#     """Split very long sentences into smaller word chunks."""
#     words = sentence.split()
#     if len(words) <= max_words:
#         return [sentence]
#     return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]


# # ----------- Core Paraphrasing -----------

# async def paraphrase_sentence(sentence: str) -> str:
#     """Paraphrase a sentence or its smaller chunks if long."""
#     if not sentence.strip():
#         return sentence  # preserve blank lines

#     chunks = chunk_sentence(sentence)
#     rewritten_chunks = []

#     for chunk in chunks:
#         try:
#             inputs = tokenizer(
#                 chunk,
#                 return_tensors="pt",
#                 truncation=True,
#                 max_length=MAX_INPUT_LENGTH,
#             ).to(torch_device)

#             if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
#                 logger.warning(f"Chunk too long, skipping: {chunk}")
#                 rewritten_chunks.append(chunk)
#                 continue

#             outputs = model.generate(
#                 **inputs,
#                 max_length=MAX_TOKENS,
#                 num_beams=NUM_BEAMS,
#                 early_stopping=True,
#                 do_sample=False,
#             )

#             result = tokenizer.decode(outputs[0], skip_special_tokens=True)

#             # Sanity check: avoid broken or poor rewrites
#             if (
#                 result.lower() != chunk.lower()
#                 and len(result.split()) >= max(3, int(len(chunk.split()) * 0.6))
#                 and not any(phrase in result.lower() for phrase in ["is a type of", "are 200", "the man is named"])
#             ):
#                 rewritten_chunks.append(result)
#             else:
#                 logger.warning(f"Low-quality rewrite or too similar: '{result}' <- '{chunk}'")
#                 rewritten_chunks.append(chunk)

#         except Exception as e:
#             logger.error(f"Error during paraphrasing: {e}")
#             rewritten_chunks.append(chunk)

#     return ' '.join(rewritten_chunks)


# async def paraphrase_paragraph(paragraph: str) -> str:
#     """Rewrite each sentence within a paragraph."""
#     if not paragraph.strip():
#         return paragraph
#     sentences = sent_tokenize(paragraph)
#     rewritten_sentences = await asyncio.gather(*[paraphrase_sentence(s) for s in sentences])
#     return ' '.join(rewritten_sentences)


# async def get_paraphrased_text(text: str) -> str:
#     """Rewrite full text input while preserving paragraph structure."""
#     if not text.strip():
#         return text
#     paragraphs = text.split('\n')
#     rewritten_paragraphs = await asyncio.gather(*[paraphrase_paragraph(p) for p in paragraphs])
#     return '\n'.join(rewritten_paragraphs)



#### --------------------------------- use the bitsandbytes INT8 quantization with transformers and accelerate  ------------------------------------

# import os
# import nltk
# import asyncio
# import torch
# import logging
# from typing import List
# from nltk.tokenize import sent_tokenize
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# # Limit CPU threads for performance tuning (important in 2vCPU env)
# torch.set_num_threads(2)

# # Setup logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# API_KEY = os.getenv("API_KEY")
# if API_KEY:
#     logger.info("API_KEY loaded successfully.")
# else:
#     logger.warning("API_KEY not found. Continuing without it.")

# # Ensure punkt tokenizer is available
# nltk_data_path = os.getenv("NLTK_DATA", "/app/nltk_data")
# nltk.data.path.append(nltk_data_path)
# try:
#     nltk.data.find("tokenizers/punkt")
# except LookupError:
#     nltk.download("punkt", download_dir=nltk_data_path)

# MAX_TOKENS = 128
# MAX_INPUT_LENGTH = 60
# NUM_BEAMS = 3
# torch_device = "cpu"
# model_name = "tuner007/pegasus_paraphrase"

# logger.info(f"Loading Pegasus model '{model_name}' for CPU...")
# tokenizer = PegasusTokenizer.from_pretrained(model_name)
# model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device).eval()

# # ----------- Utilities -----------

# def split_into_sentences(text: str) -> List[str]:
#     """Preserve paragraph breaks while tokenizing into sentences."""
#     sentences = []
#     for para in text.split('\n'):
#         if para.strip():
#             sentences.extend(sent_tokenize(para))
#         else:
#             sentences.append('')  # preserve blank lines
#     return sentences

# def chunk_sentence(sentence: str, max_words: int = 50) -> List[str]:
#     """Split very long sentences into smaller word chunks."""
#     words = sentence.split()
#     if len(words) <= max_words:
#         return [sentence]
#     return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

# # ----------- Core Paraphrasing Logic -----------

# async def paraphrase_sentence(sentence: str) -> str:
#     if not sentence.strip():
#         return sentence  # preserve blank lines

#     chunks = chunk_sentence(sentence)
#     rewritten_chunks = []

#     for chunk in chunks:
#         try:
#             inputs = tokenizer(
#                 chunk,
#                 return_tensors="pt",
#                 truncation=True,
#                 max_length=MAX_INPUT_LENGTH,
#             ).to(torch_device)

#             outputs = model.generate(
#                 **inputs,
#                 max_length=MAX_TOKENS,
#                 num_beams=NUM_BEAMS,
#                 early_stopping=True,
#                 do_sample=False,
#             )

#             result = tokenizer.decode(outputs[0], skip_special_tokens=True)

#             # Quality checks
#             if (
#                 result.lower() != chunk.lower()
#                 and len(result.split()) >= max(3, int(len(chunk.split()) * 0.6))
#                 and not any(phrase in result.lower() for phrase in ["is a type of", "are 200", "the man is named"])
#             ):
#                 rewritten_chunks.append(result)
#             else:
#                 logger.warning(f"Low-quality rewrite: '{result}' <- '{chunk}'")
#                 rewritten_chunks.append(chunk)

#         except Exception as e:
#             logger.error(f"Paraphrasing error: {e}")
#             rewritten_chunks.append(chunk)

#     return ' '.join(rewritten_chunks)

# async def paraphrase_paragraph(paragraph: str) -> str:
#     if not paragraph.strip():
#         return paragraph
#     sentences = sent_tokenize(paragraph)
#     rewritten_sentences = await asyncio.gather(*[paraphrase_sentence(s) for s in sentences])
#     return ' '.join(rewritten_sentences)

# async def get_paraphrased_text(text: str) -> str:
#     if not text.strip():
#         return text
#     paragraphs = text.split('\n')
#     rewritten_paragraphs = await asyncio.gather(*[paraphrase_paragraph(p) for p in paragraphs])
#     return '\n'.join(rewritten_paragraphs)


############## update the above code ####################

# import os
# import nltk
# import asyncio
# import torch
# import logging
# from typing import List
# from nltk.tokenize import sent_tokenize
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# # Limit CPU threads for performance tuning (especially in Hugging Face 2vCPU env)
# torch.set_num_threads(2)

# # Setup logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # Optional API key
# API_KEY = os.getenv("API_KEY")
# if API_KEY:
#     logger.info("API_KEY loaded successfully.")
# else:
#     logger.warning("API_KEY not found. Continuing without it.")

# # Ensure punkt tokenizer is available
# nltk_data_path = os.getenv("NLTK_DATA", "/app/nltk_data")
# nltk.data.path.append(nltk_data_path)
# try:
#     nltk.data.find("tokenizers/punkt")
# except LookupError:
#     nltk.download("punkt", download_dir=nltk_data_path)

# # Model config
# MAX_TOKENS = 128
# MAX_INPUT_LENGTH = 60
# NUM_BEAMS = 3
# torch_device = "cpu"
# model_name = "tuner007/pegasus_paraphrase"

# # Load tokenizer and model
# logger.info(f"Loading Pegasus model '{model_name}' for CPU...")
# tokenizer = PegasusTokenizer.from_pretrained(model_name)
# model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device).eval()

# # ----------- Utilities -----------

# def split_into_sentences(text: str) -> List[str]:
#     """Preserve paragraph breaks while tokenizing into sentences."""
#     sentences = []
#     for para in text.split('\n'):
#         if para.strip():
#             sentences.extend(sent_tokenize(para))
#         else:
#             sentences.append('')  # preserve blank lines
#     return sentences

# def chunk_sentence(sentence: str, max_words: int = 50) -> List[str]:
#     """Split very long sentences into smaller word chunks."""
#     words = sentence.split()
#     if len(words) <= max_words:
#         return [sentence]
#     return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

# # ----------- Core Paraphrasing Logic -----------

# async def paraphrase_sentence(sentence: str) -> str:
#     """Paraphrase a sentence or short chunk."""
#     if not sentence.strip():
#         return sentence  # Preserve blank lines

#     chunks = chunk_sentence(sentence)
#     rewritten_chunks = []

#     for chunk in chunks:
#         try:
#             inputs = tokenizer(
#                 chunk,
#                 return_tensors="pt",
#                 truncation=True,
#                 max_length=MAX_INPUT_LENGTH,
#             ).to(torch_device)

#             outputs = model.generate(
#                 **inputs,
#                 max_length=MAX_TOKENS,
#                 num_beams=NUM_BEAMS,
#                 early_stopping=True,
#                 do_sample=False,
#             )

#             result = tokenizer.decode(outputs[0], skip_special_tokens=True)

#             # Quality check
#             if (
#                 result.lower() != chunk.lower()
#                 and len(result.split()) >= max(3, int(len(chunk.split()) * 0.6))
#                 and not any(phrase in result.lower() for phrase in ["is a type of", "are 200", "the man is named"])
#             ):
#                 rewritten_chunks.append(result)
#             else:
#                 logger.warning(f"Low-quality rewrite: '{result}' <- '{chunk}'")
#                 rewritten_chunks.append(chunk)

#         except Exception as e:
#             logger.error(f"Paraphrasing error: {e}")
#             rewritten_chunks.append(chunk)

#     return ' '.join(rewritten_chunks)

# async def paraphrase_paragraph(paragraph: str) -> str:
#     """Paraphrase a paragraph by rewriting each sentence."""
#     if not paragraph.strip():
#         return paragraph  # Preserve blank lines
#     sentences = sent_tokenize(paragraph)
#     rewritten_sentences = await asyncio.gather(*[paraphrase_sentence(s) for s in sentences])
#     return ' '.join(rewritten_sentences)

# async def get_paraphrased_text(text: str) -> str:
#     """Main paraphrasing function to handle full texts with paragraph preservation."""
#     if not text.strip():
#         return text
#     paragraphs = text.split('\n')
#     rewritten_paragraphs = await asyncio.gather(*[paraphrase_paragraph(p) for p in paragraphs])
#     return '\n'.join(rewritten_paragraphs)


################# grammar logic added - improved version ##################


import os
import nltk
import asyncio
import torch
import logging
from typing import List
from nltk.tokenize import sent_tokenize
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Module initialisation. Everything below runs at import time, in order:
# thread limit -> logging -> optional API key -> NLTK data -> model load.

# Limit CPU threads for performance tuning (especially in Hugging Face 2vCPU env)
torch.set_num_threads(2)

# Setup logging: root logger at INFO, plus a module-level logger used by every function below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Optional API key loading (if needed).
# The value is only read from the environment and its presence logged here;
# nothing else in the visible code references API_KEY.
API_KEY = os.getenv("API_KEY")
if API_KEY:
    logger.info("API_KEY loaded successfully.")
else:
    logger.warning("API_KEY not found. Continuing without it.")

# Ensure punkt tokenizer is available: register the data directory (overridable
# through the NLTK_DATA environment variable) and download "punkt" into it only
# when the lookup fails.
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead of "punkt"
# when sent_tokenize is called — confirm against the pinned NLTK version.
nltk_data_path = os.getenv("NLTK_DATA", "/app/nltk_data")
nltk.data.path.append(nltk_data_path)
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", download_dir=nltk_data_path)

# Model config
MAX_TOKENS = 128          # Output max tokens (passed as max_length to model.generate)
MAX_INPUT_LENGTH = 60     # Input max tokens per chunk (pegasus prefers shorter input chunks)
# NOTE(review): chunk_sentence() splits on 50 *words* by default, which can exceed
# 60 *tokens*; the tokenizer call then truncates the chunk — confirm this is intended.
NUM_BEAMS = 3             # Beam width passed to model.generate
torch_device = "cpu"      # Device the model and the tokenized inputs are moved to
model_name = "tuner007/pegasus_paraphrase"  # Identifier handed to from_pretrained

# Load tokenizer and model once at import; .eval() switches the model to inference mode.
logger.info(f"Loading Pegasus model '{model_name}' for CPU...")
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device).eval()

# ----------- Utilities -----------

def split_into_sentences(text: str) -> List[str]:
    """Flatten ``text`` into a list of sentences, line by line.

    The text is split on ``'\\n'``. Each line holding non-whitespace content
    contributes the sentences ``sent_tokenize`` finds in it; each blank (or
    whitespace-only) line contributes a single empty string so the caller can
    still see where the line breaks were.
    """
    collected: List[str] = []
    for line in text.split('\n'):
        if not line.strip():
            # Blank line: keep an empty marker instead of dropping it.
            collected.append('')
            continue
        collected += sent_tokenize(line)
    return collected

def chunk_sentence(sentence: str, max_words: int = 50) -> List[str]:
    """Break an over-long sentence into consecutive groups of words.

    Args:
        sentence: Text to split; words are whatever ``str.split()`` yields.
        max_words: Largest number of words allowed in one chunk.

    Returns:
        ``[sentence]`` unchanged when it has at most ``max_words`` words;
        otherwise a list of chunks, each made of up to ``max_words`` words
        re-joined with single spaces.
    """
    tokens = sentence.split()
    total = len(tokens)
    if total > max_words:
        pieces = []
        for start in range(0, total, max_words):
            pieces.append(' '.join(tokens[start:start + max_words]))
        return pieces
    # Short enough: hand back the original string, whitespace untouched.
    return [sentence]

def simple_grammar_fix(text: str) -> str:
    """Apply a minimal cosmetic clean-up to ``text``.

    Each sentence found by ``sent_tokenize`` is stripped and gets its first
    character upper-cased; the sentences are re-joined with single spaces, and
    a space directly before ``,``, ``.``, ``!`` or ``?`` is removed. This is
    not a real grammar checker.
    """
    polished = []
    for raw in sent_tokenize(text):
        trimmed = raw.strip()
        # Slicing keeps this safe for an empty sentence (yields "").
        polished.append(trimmed[:1].upper() + trimmed[1:])

    joined = " ".join(polished)
    # Same replacement order as always: comma, full stop, exclamation, question mark.
    for spaced, tight in ((" ,", ","), (" .", "."), (" !", "!"), (" ?", "?")):
        joined = joined.replace(spaced, tight)
    return joined

# ----------- Core Paraphrasing Logic -----------

async def paraphrase_sentence(sentence: str) -> str:
    """Paraphrase a sentence or short chunk asynchronously."""
    if not sentence.strip():
        return sentence  # Preserve blank lines

    chunks = chunk_sentence(sentence)
    rewritten_chunks = []

    for chunk in chunks:
        try:
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            ).to(torch_device)

            outputs = model.generate(
                **inputs,
                max_length=MAX_TOKENS,
                num_beams=NUM_BEAMS,
                early_stopping=True,
                do_sample=False,
                no_repeat_ngram_size=2,
            )

            result = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Quality check to avoid bad paraphrases and preserve meaning & length
            if (
                result.lower() != chunk.lower()
                and len(result.split()) >= max(3, int(len(chunk.split()) * 0.6))
                and not any(phrase in result.lower() for phrase in ["is a type of", "are 200", "the man is named"])
            ):
                fixed_result = simple_grammar_fix(result)
                rewritten_chunks.append(fixed_result)
            else:
                logger.warning(f"Low-quality rewrite detected, using original chunk.\nOriginal: {chunk}\nResult: {result}")
                rewritten_chunks.append(chunk)

        except Exception as e:
            logger.error(f"Paraphrasing error: {e}")
            rewritten_chunks.append(chunk)

    return ' '.join(rewritten_chunks)

async def paraphrase_paragraph(paragraph: str) -> str:
    """Rewrite a single paragraph sentence by sentence.

    A blank or whitespace-only paragraph is returned untouched. Otherwise the
    paragraph is split with ``sent_tokenize``, every sentence is passed to
    ``paraphrase_sentence`` through ``asyncio.gather``, and the rewritten
    sentences are joined with single spaces in their original order.
    """
    if paragraph.strip():
        pending = [paraphrase_sentence(part) for part in sent_tokenize(paragraph)]
        results = await asyncio.gather(*pending)
        return ' '.join(results)

    # Nothing to rewrite: keep blank lines exactly as they came in.
    return paragraph

async def get_paraphrased_text(text: str) -> str:
    """Entry point: rewrite a multi-line text while keeping its line layout.

    Blank or whitespace-only input is returned as-is. Otherwise the text is
    split on ``'\\n'``, each line is handed to ``paraphrase_paragraph`` through
    ``asyncio.gather``, and the results are re-joined with ``'\\n'`` in the
    same order.
    """
    if text.strip():
        pending = [paraphrase_paragraph(block) for block in text.split('\n')]
        results = await asyncio.gather(*pending)
        return '\n'.join(results)

    return text

# Example synchronous wrapper (if you want sync calls)
def paraphrase_text_sync(text: str) -> str:
    """Blocking convenience wrapper around ``get_paraphrased_text``.

    Drives the coroutine to completion with ``asyncio.run`` and returns its
    result.
    """
    coroutine = get_paraphrased_text(text)
    return asyncio.run(coroutine)









######------------------------------- add minecraft terms ----------------------------------------------------------------------

# import os
# import nltk
# import torch
# import re
# import logging
# import asyncio
# from nltk.tokenize import sent_tokenize
# from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# from concurrent.futures import ThreadPoolExecutor
# from typing import List, Tuple, Dict

# # Configure logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # NLTK Setup
# nltk_data_path = os.getenv("NLTK_DATA", "/app/nltk_data")
# nltk.data.path.append(nltk_data_path)
# try:
#     nltk.data.find('tokenizers/punkt')
# except LookupError:
#     nltk.download('punkt', download_dir=nltk_data_path)

# # Model Loading with CPU optimization
# pegasus_model = PegasusForConditionalGeneration.from_pretrained(
#     "/app/pegasus_model",
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float32
# ).eval()
# tokenizer = PegasusTokenizer.from_pretrained("/app/pegasus_model")

# # Configuration
# DYNAMIC_MAX_TOKENS = 768  # Base token length
# ABSOLUTE_MAX = 1024       # For technical descriptions
# NUM_BEAMS = 4             # Improved quality
# BATCH_SIZE = 3            # Optimal for 2vCPU
# MAX_WORKERS = 2           # Matches your 2vCPU
    
# # Dynamic Term Protection System
# def extract_protected_terms(text: str) -> set:
#     """Auto-detect terms to protect from the input text"""
#     protected = set()
    
#     # Extract ALL-CAPS terms and phrases in quotes
#     protected.update(re.findall(r'([A-Z][A-Z0-9_]+(?:\s[A-Z0-9_]+)*)', text))
#     protected.update(re.findall(r'\"([^\"]+)\"', text))
    
#     # Extract noun phrases with 2+ capital letters
#     protected.update(
#         phrase.strip() for phrase in re.findall(r'([A-Z][a-z]+(?:\s[A-Z][a-z]+)+)', text)
#         if len(phrase.split()) > 1
#     )
    
#     return {term.lower() for term in protected}

# # Format Protection Patterns
# FORMAT_PATTERNS = [
#     (r'\*\*(.*?)\*\*', 'BOLD'),          # **bold text**
#     (r'([A-Z]{2,}(?:\s[A-Z0-9_]+)*:)', 'HEADER'),  # HEADERS:
#     (r'\n- (.*?)(\n|$)', 'BULLET'),      # - bullet points
#     (r'`(.*?)`', 'CODE'),                # `code`
#     (r'\"(.*?)\"', 'QUOTE')              # "quoted text"
# ]

# def protect_content(text: str) -> Tuple[str, Dict[str, str]]:
#     """Dynamic content protection"""
#     protected_terms = extract_protected_terms(text)
#     restoration = {}
#     protected_text = text
    
#     # Protect formats
#     for pattern, tag in FORMAT_PATTERNS:
#         for match in re.finditer(pattern, protected_text):
#             placeholder = f"PROTECT_{tag}_{len(restoration)}"
#             protected_text = protected_text.replace(match.group(0), placeholder)
#             restoration[placeholder] = match.group(0)
    
#     # Protect terms (case-insensitive)
#     words = re.split(r'(\W+)', protected_text)
#     for i, word in enumerate(words):
#         lower_word = word.lower()
#         if lower_word in protected_terms:
#             placeholder = f"TERM_{abs(hash(lower_word))}"
#             words[i] = placeholder
#             restoration[placeholder] = word
#     protected_text = ''.join(words)
    
#     return protected_text, restoration

# def restore_content(text: str, restoration: Dict[str, str]) -> str:
#     """Restore protected content"""
#     for placeholder in sorted(restoration.keys(), key=len, reverse=True):
#         text = text.replace(placeholder, restoration[placeholder])
#     return text

# def paraphrase_batch(sentences: List[str]) -> List[str]:
#     """Quality-focused batch processing"""
#     max_len = max(
#         ABSOLUTE_MAX if len(s.split()) > 25 else DYNAMIC_MAX_TOKENS 
#         for s in sentences
#     )
    
#     inputs = tokenizer(
#         sentences,
#         return_tensors="pt",
#         padding=True,
#         truncation=True,
#         max_length=max_len
#     )
    
#     outputs = pegasus_model.generate(
#         **inputs,
#         max_length=max_len + 64,
#         num_beams=NUM_BEAMS,
#         early_stopping=True,
#         temperature=0.8,
#         top_p=0.9,
#         no_repeat_ngram_size=3,
#         length_penalty=1.0,
#         do_sample=False
#     )
#     return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# async def process_paragraph(paragraph: str) -> str:
#     """Paragraph processing pipeline"""
#     if not paragraph.strip():
#         return paragraph
    
#     try:
#         protected, restoration = protect_content(paragraph)
#         sentences = sent_tokenize(protected)
        
#         with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
#             batches = [sentences[i:i+BATCH_SIZE] for i in range(0, len(sentences), BATCH_SIZE)]
#             results = []
#             for batch in batches:
#                 results.extend(paraphrase_batch(batch))
        
#         return restore_content(' '.join(results), restoration)
    
#     except Exception as e:
#         logger.error(f"Paragraph processing failed: {e}")
#         return paragraph

# async def get_paraphrased_text(text: str) -> str:
#     """Main processing function"""
#     paragraphs = [p for p in text.split('\n') if p.strip() or p == '']
#     processed = await asyncio.gather(*[process_paragraph(p) for p in paragraphs])
#     return '\n'.join(processed)