Spaces:
Sleeping
Sleeping
import gradio as gr
from datasets import load_dataset
import tempfile
import re
# Common honorific abbreviations that end with a period; process_text keeps a
# plain space (no line break) after these so a sentence is not split mid-name.
TITLES = {"Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Sr.", "Jr."}
def is_latin(text):
    """Return True if *text* consists entirely of ASCII characters.

    Note: despite the name, this is an ASCII check, not a Unicode "Latin
    script" check — accented Latin letters such as 'é' are rejected too.
    Equivalent to the original ``not re.search(r'[^\\x00-\\x7F]', text)``.
    """
    return text.isascii()
def clean_text(text):
    """Strip '**' markers and drop sentences containing non-ASCII characters.

    Sentences are delimited by '.', '!' or '?' followed by whitespace; the
    surviving sentences are re-joined with single spaces.
    """
    # Literal '**' needs no regex; str.replace does the same job.
    without_markup = text.replace('**', '')
    # Lookbehind keeps the terminating punctuation attached to each sentence.
    sentences = re.split(r'(?<=[.!?])\s+', without_markup)
    ascii_only = [sentence for sentence in sentences if sentence.isascii()]
    return ' '.join(ascii_only)
def process_text(text):
    """Reformat *text* for plain-text output.

    - Inserts a newline after each word ending with a period, except for
      known titles (Mr., Dr., ...) which keep a trailing space instead.
    - Replaces each '### Simplified Version' marker with 'Chapter N', where
      N increments per occurrence.

    Bug fix vs. the original: the marker branch used to blank out the
    'Simplified' and 'Version' words but still iterated over them, emitting
    two stray spaces after every chapter heading. The index now skips the
    consumed words entirely.
    """
    words = text.split()
    pieces = []
    chapter_counter = 3  # NOTE(review): original starts numbering at 3 — confirm intent
    i = 0
    total = len(words)
    while i < total:
        word = words[i]
        # Replace the '### Simplified Version' marker with a chapter heading.
        if (word == "###" and i + 2 < total
                and words[i + 1] == "Simplified" and words[i + 2] == "Version"):
            pieces.append(f"Chapter {chapter_counter} ")
            chapter_counter += 1
            i += 3  # consume 'Simplified' and 'Version' — do not emit them
            continue
        if word in TITLES:
            # Titles like 'Mr.' end with a period but must not break the line.
            pieces.append(word + " ")
        elif word.endswith('.'):
            # A word ending in '."' never ends with '.', so quoted sentence
            # ends naturally fall through to the plain-space branch below
            # (the original's `not word.endswith('."')` guard was dead code).
            pieces.append(word + "\n")
        else:
            pieces.append(word + " ")
        i += 1
    # Drop the trailing space/newline from the final word.
    return "".join(pieces).strip()
def combine_dataset_texts(dataset_name, split, text_column):
    """Download a Hugging Face dataset split, clean and reflow its text
    column, and return the path of a temporary .txt file with the result.

    Raises gr.Error with a user-facing message on any failure.
    """
    try:
        # Load the dataset from the Hugging Face Hub.
        dataset = load_dataset(dataset_name, split=split)
        # Verify the requested text column exists before iterating.
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")
        # Combine all texts into a single string without separating datapoints.
        combined_text = " ".join(example[text_column] for example in dataset)
        # Clean (drop non-ASCII sentences, strip '**') then reflow
        # (newlines after sentences, 'Chapter N' headings).
        cleaned_text = clean_text(combined_text)
        processed_text = process_text(cleaned_text)
        # delete=False: Gradio needs the file to survive past this call so it
        # can serve it for download.
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
            f.write(processed_text)
        return f.name
    except gr.Error:
        # Already a user-facing error (e.g. missing column) — don't re-wrap
        # it as "Error processing dataset: ..." like the blanket handler
        # below used to.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}") from e
# Create Gradio interface: inputs row -> submit button -> outputs row.
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")
    with gr.Row():
        # Dataset coordinates: repo id, split name, and the column holding text.
        dataset_input = gr.Textbox(label="Dataset Name",
                                   placeholder="username/dataset-name")
        split_input = gr.Textbox(label="Split", value="train")
        column_input = gr.Textbox(label="Text Column", value="text")
    submit_btn = gr.Button("Combine Texts")
    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        # Hidden placeholder; failures actually surface as gr.Error popups
        # raised by combine_dataset_texts, not through this textbox.
        error_out = gr.Textbox(label="Error Output", visible=False)
    # Wire the button to the processing pipeline; api_name also exposes the
    # function as a named API endpoint ("/combine_texts").
    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts"
    )
if __name__ == "__main__":
    demo.launch()