Spaces:
Sleeping
Sleeping
import gradio as gr
from datasets import load_dataset
import tempfile
import re
# Common honorific abbreviations that end with a period; process_text keeps a
# plain space (no line break) after these so a sentence is not split mid-name.
TITLES = {"Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Sr.", "Jr."}
def is_latin(text):
    """Return True if *text* consists entirely of ASCII characters.

    Note: despite the name, this is an ASCII check, not a Unicode "Latin
    script" check — accented Latin letters such as 'é' are rejected too.
    Equivalent to the original ``not re.search(r'[^\\x00-\\x7F]', text)``.
    """
    return text.isascii()
def clean_text(text):
    """Strip '**' markers and drop sentences containing non-ASCII characters.

    Sentences are delimited by '.', '!' or '?' followed by whitespace; the
    surviving sentences are re-joined with single spaces.
    """
    # Literal '**' needs no regex; str.replace does the same job.
    without_markup = text.replace('**', '')
    # Lookbehind keeps the terminating punctuation attached to each sentence.
    sentences = re.split(r'(?<=[.!?])\s+', without_markup)
    ascii_only = [sentence for sentence in sentences if sentence.isascii()]
    return ' '.join(ascii_only)
def process_text(text):
    """Reformat *text* for plain-text output.

    - Inserts a newline after each word ending with a period, except for
      known titles (Mr., Dr., ...) which keep a trailing space instead.
    - Replaces each '### Simplified Version' marker with 'Chapter N', where
      N increments per occurrence.

    Bug fix vs. the original: the marker branch used to blank out the
    'Simplified' and 'Version' words but still iterated over them, emitting
    two stray spaces after every chapter heading. The index now skips the
    consumed words entirely.
    """
    words = text.split()
    pieces = []
    chapter_counter = 3  # NOTE(review): original starts numbering at 3 — confirm intent
    i = 0
    total = len(words)
    while i < total:
        word = words[i]
        # Replace the '### Simplified Version' marker with a chapter heading.
        if (word == "###" and i + 2 < total
                and words[i + 1] == "Simplified" and words[i + 2] == "Version"):
            pieces.append(f"Chapter {chapter_counter} ")
            chapter_counter += 1
            i += 3  # consume 'Simplified' and 'Version' — do not emit them
            continue
        if word in TITLES:
            # Titles like 'Mr.' end with a period but must not break the line.
            pieces.append(word + " ")
        elif word.endswith('.'):
            # A word ending in '."' never ends with '.', so quoted sentence
            # ends naturally fall through to the plain-space branch below
            # (the original's `not word.endswith('."')` guard was dead code).
            pieces.append(word + "\n")
        else:
            pieces.append(word + " ")
        i += 1
    # Drop the trailing space/newline from the final word.
    return "".join(pieces).strip()
def combine_dataset_texts(dataset_name, split, text_column):
    """Download a Hugging Face dataset split, clean and reflow its text
    column, and return the path of a temporary .txt file with the result.

    Raises gr.Error with a user-facing message on any failure.
    """
    try:
        # Load the dataset from the Hugging Face Hub.
        dataset = load_dataset(dataset_name, split=split)
        # Verify the requested text column exists before iterating.
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")
        # Combine all texts into a single string without separating datapoints.
        combined_text = " ".join(example[text_column] for example in dataset)
        # Clean (drop non-ASCII sentences, strip '**') then reflow
        # (newlines after sentences, 'Chapter N' headings).
        cleaned_text = clean_text(combined_text)
        processed_text = process_text(cleaned_text)
        # delete=False: Gradio needs the file to survive past this call so it
        # can serve it for download.
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
            f.write(processed_text)
        return f.name
    except gr.Error:
        # Already a user-facing error (e.g. missing column) — don't re-wrap
        # it as "Error processing dataset: ..." like the blanket handler
        # below used to.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}") from e
# Create Gradio interface: inputs row -> submit button -> outputs row.
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")
    with gr.Row():
        # Dataset coordinates: repo id, split name, and the column holding text.
        dataset_input = gr.Textbox(label="Dataset Name",
                                   placeholder="username/dataset-name")
        split_input = gr.Textbox(label="Split", value="train")
        column_input = gr.Textbox(label="Text Column", value="text")
    submit_btn = gr.Button("Combine Texts")
    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        # Hidden placeholder; failures actually surface as gr.Error popups
        # raised by combine_dataset_texts, not through this textbox.
        error_out = gr.Textbox(label="Error Output", visible=False)
    # Wire the button to the processing pipeline; api_name also exposes the
    # function as a named API endpoint ("/combine_texts").
    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts"
    )
if __name__ == "__main__":
    demo.launch()