Spaces:

deddoggo
/

traffic-rules-chatbot

Paused

traffic-rules-chatbot / utils.py

innit

25950e2 2 months ago

1.5 kB

	# utils.py

	import re
	import unicodedata
	from typing import List

	def tokenize_vi_simple(text: str) -> List[str]:
	    """Split Vietnamese text into simple lowercase tokens (e.g. for BM25).

	    The text is normalized to Unicode NFC, lowercased, stripped of every
	    character that is neither a word character (``\\w``) nor whitespace,
	    and finally split on whitespace.

	    Args:
	        text (str): The input Vietnamese text.

	    Returns:
	        List[str]: The tokens in order of appearance. An empty list is
	        returned when ``text`` is not a string (no exception is raised)
	        or when nothing remains after cleaning.
	    """
	    if not isinstance(text, str):
	        # Deliberately lenient: non-string input yields no tokens
	        # instead of raising TypeError.
	        return []

	    # Vietnamese diacritics may arrive decomposed (base letter followed by
	    # combining marks). Combining marks are not matched by ``\w``, so
	    # without composing first they would be deleted below and "luật"
	    # would silently turn into "luat". NFC input is unaffected.
	    text = unicodedata.normalize("NFC", text).lower()

	    # Remove characters that are not word characters or whitespace.
	    # Punctuation is deleted (not replaced by a space), so "200.000đ"
	    # becomes the single token "200000đ".
	    text = re.sub(r'[^\w\s]', '', text)
	    return text.split()

	# You can add other general utility functions here as your project grows.
	# For example:
	# - Functions for logging
	# - Functions for path manipulation if they are used across multiple modules
	# - Simple data validation or cleaning routines not specific to law data or LLMs

	if __name__ == '__main__':
	    # Manual smoke test: runs only when this file is executed directly.
	    print("Testing utils.py...")

	    # Exercise tokenize_vi_simple on ordinary sentences, an empty string,
	    # and two non-string values.
	    print("\n--- Test tokenize_vi_simple ---")
	    samples = (
	        "Luật Giao thông Đường bộ Việt Nam 2023!",
	        "Xe ô tô con và xe máy.",
	        " Phạt tiền từ 200.000đ đến 400.000đ. ",
	        "",
	        None,  # not a string
	        123,  # not a string
	    )
	    for phrase in samples:
	        print(f"Input: '{phrase}' (type: {type(phrase).__name__})")
	        tokens = tokenize_vi_simple(phrase)
	        print(f"Tokens: {tokens}")
	        print("-" * 10)