Spaces:

KalbeDigitalLab
/

NutriGenMePE

Build error

App Files Files Community

NutriGenMePE / utils.py

firqaaa

Upload 6 files

eb88b82 about 2 years ago

raw

history blame contribute delete

3.17 kB

	import os
	import shutil
	import textwrap

	import nltk
	import re
	from Bio import Entrez


	def replace_quotes(text):
	    """Run the quote-substitution regex over ``text`` and return the result.

	    NOTE(review): each match is the run of non-quote characters sitting
	    between two double quotes, so a matched span never contains a ``"``
	    and the per-match replacement has nothing to change -- the text comes
	    back unchanged. Confirm the intended behaviour before relying on it.
	    """
	    between_quotes = re.compile(r'(?<=")[^"]*(?=")')

	    def _swap_quotes(found):
	        return found.group(0).replace('"', "'")

	    return between_quotes.sub(_swap_quotes, text)


	def clean_text(text):
	    """Drop short and caption-like rows from ``text``, then strip punctuation.

	    A row is kept only when splitting it on single spaces yields more than
	    three pieces and it does not begin with "(a)" or "Figure". The kept rows
	    are re-joined with newlines, and every character that is neither a word
	    character nor whitespace is removed from the result.
	    """
	    kept_rows = []
	    for line in text.split("\n"):
	        if len(line.split(" ")) <= 3:
	            continue
	        if line.startswith("(a)") or line.startswith("Figure"):
	            continue
	        kept_rows.append(line)
	    return re.sub(r'[^\w\s]', '', "\n".join(kept_rows))


	def truncate_text(text, max_tokens):
	    """Return the first wrapped line of ``text``, or "" when there is none.

	    ``max_tokens`` is handed to ``textwrap.TextWrapper`` as the line width,
	    so the limit is applied in characters rather than tokens.
	    """
	    wrapped_lines = textwrap.TextWrapper(width=max_tokens).wrap(text)
	    return wrapped_lines[0] if wrapped_lines else ""


	def split_text(text, chunk_size):
	    """Split ``text`` into consecutive pieces of at most ``chunk_size`` items.

	    Args:
	        text: String (or other sliceable sequence) to split.
	        chunk_size: Positive length of each piece; the final piece may be
	            shorter.

	    Returns:
	        The slices in order. An empty ``text`` gives an empty list.

	    Raises:
	        ValueError: If ``chunk_size`` is zero or negative. Without this
	            check the start offset never advances past the end of the
	            text, so the loop would never terminate.
	    """
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
	    return [
	        text[offset:offset + chunk_size]
	        for offset in range(0, len(text), chunk_size)
	    ]


	def extract_gene_name(text):
	    """Return the contents of the first ``<NAME>...</NAME>`` element in ``text``.

	    Args:
	        text: Payload to search, either UTF-8 encoded bytes or an
	            already-decoded string.

	    Returns:
	        The text between the first ``<NAME>`` / ``</NAME>`` pair, or
	        ``None`` when no such pair is present.
	    """
	    # Only decode when given bytes, so a str payload no longer fails with
	    # AttributeError on ``.decode``.
	    if isinstance(text, (bytes, bytearray)):
	        text = text.decode("utf-8")
	    # Remove literal two-character escape sequences (backslash + n / t) and
	    # unescape backslash-quote; real newline/tab characters are untouched.
	    cleaned = text.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
	    found = re.search(r"<NAME>(.*?)</NAME>", cleaned)
	    return found.group(1) if found else None


	def get_geneName(rsid):
	    """Fetch the ``snp`` database record for ``rsid`` and extract its gene name.

	    Args:
	        rsid: Identifier passed as ``id`` to ``Entrez.efetch``.

	    Returns:
	        Whatever ``extract_gene_name`` returns for the fetched XML payload:
	        the first ``<NAME>`` element's text, or ``None`` if it has none.
	    """
	    handle = Entrez.efetch(db="snp", id=rsid, retmode='xml')
	    try:
	        payload = handle.read()
	    finally:
	        # Release the underlying connection even if reading fails.
	        handle.close()
	    return extract_gene_name(payload)


	def split_text_into_sentences(text, num_sentences):
	    """Tokenize ``text`` into sentences and group them ``num_sentences`` at a time.

	    Returns a list of lists; each inner list holds up to ``num_sentences``
	    consecutive sentences as produced by ``nltk.sent_tokenize``, and the
	    last group may be shorter.
	    """
	    all_sentences = nltk.sent_tokenize(text)
	    groups = []
	    for first in range(0, len(all_sentences), num_sentences):
	        groups.append(all_sentences[first:first + num_sentences])
	    return groups


	def flatten_list(nested_list):
	    """Return a flat list of the non-list items in ``nested_list``.

	    Items that are ``list`` instances are flattened recursively, in order;
	    anything else (including tuples) is kept as a single element.
	    """
	    flat = []
	    for element in nested_list:
	        flat += flatten_list(element) if isinstance(element, list) else [element]
	    return flat


	def move_file(source_path, destination_path):
	    """Move ``source_path`` to ``destination_path``, creating the latter if absent.

	    When nothing exists at ``destination_path`` it is first created as a
	    directory (with any missing parents). The outcome is reported with
	    ``print``: a success message, or ``Error: ...`` for any exception
	    raised while moving. Nothing is returned.
	    """
	    destination_missing = not os.path.exists(destination_path)
	    if destination_missing:
	        os.makedirs(destination_path)

	    try:
	        shutil.move(source_path, destination_path)
	        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
	    except Exception as err:
	        print(f"Error: {err}")

	def upper_abbreviation(text):
	    """Remove "unknown" from ``text`` and upper-case abbreviation-like runs.

	    Every case-insensitive occurrence of "unknown" is deleted first. Then
	    each run of one or more tokens shaped like a capital letter, one
	    lowercase letter or dot, an optional dot and a whitespace character is
	    replaced by the same run with its dots removed and letters upper-cased.
	    """
	    without_unknown = re.compile(r'unknown', re.IGNORECASE).sub('', text)
	    return re.sub(
	        r'\b(?:[A-Z][a-z.]\.?\s)+\b',
	        lambda hit: hit.group(0).replace('.', '').upper(),
	        without_unknown,
	    )


	def get_valid_year(input_text):
	    """Return the first standalone four-character word in ``input_text``.

	    An empty string is returned when there is no such word.

	    NOTE(review): the pattern accepts any four word characters, not only
	    digits, so a token such as "from" is returned if it comes first --
	    confirm whether a digits-only match was intended.
	    """
	    first_hit = re.search(r'\b\w{4}\b', input_text)
	    if first_hit is None:
	        return ''
	    return first_hit.group(0)


	def sample_size_postproc(text):
	    """Strip identifier-like tokens from a sample-size string.

	    Whitespace-separated words whose first four characters are
	    <any><digit><any><letter> are dropped, the rest are joined with single
	    spaces, and any remaining standalone letters-then-digits token is
	    deleted (leaving the surrounding spaces in place).
	    """
	    kept_words = []
	    for token in text.split():
	        if re.match(r'.\d.[A-Za-z].*$', token) is None:
	            kept_words.append(token)
	    return re.sub(r'\b[A-Za-z]+\d+\b', '', ' '.join(kept_words))