Spaces:
Build error
Build error
| import os | |
| import shutil | |
| import textwrap | |
| import nltk | |
| import re | |
| from Bio import Entrez | |
def replace_quotes(text):
    """Apply a double-to-single quote substitution to spans between double quotes.

    A span is a run of non-double-quote characters that has a double quote
    immediately before it and immediately after it. Inside each span, every
    double quote is replaced with a single quote.

    NOTE(review): the pattern excludes double quotes from the span itself, so
    the substitution has nothing to replace and the text comes back
    unchanged. Confirm the intended behaviour with the callers.
    """
    between_quotes = re.compile(r'(?<=")[^"]*(?=")')

    def _single_quoted(found):
        return found.group(0).replace('"', "'")

    return between_quotes.sub(_single_quoted, text)
def clean_text(text):
    """Remove section titles and figure descriptions from text.

    A line is kept only when splitting it on single spaces yields more than
    three pieces and it does not start with "(a)" or "Figure". After the
    surviving lines are re-joined with newlines, every character that is
    neither a word character nor whitespace is removed.
    """
    kept_rows = []
    for line in text.split("\n"):
        # Short lines are treated as headings and dropped.
        if len(line.split(" ")) <= 3:
            continue
        # Sub-figure labels and figure captions are dropped as well.
        if line.startswith("(a)") or line.startswith("Figure"):
            continue
        kept_rows.append(line)
    return re.sub(r'[^\w\s]', '', "\n".join(kept_rows))
def truncate_text(text, max_tokens):
    """Return the first line obtained by wrapping text at max_tokens columns.

    The limit is handed to textwrap as a line width, so it is measured in
    characters. An empty string is returned when wrapping yields no lines
    (for example, for empty or whitespace-only input).
    """
    wrapped_lines = textwrap.wrap(text, width=max_tokens)
    return wrapped_lines[0] if wrapped_lines else ""
def split_text(text, chunk_size):
    """Split text into consecutive slices of at most chunk_size items.

    Args:
        text: The string (or other sliceable sequence) to split.
        chunk_size: Maximum length of each chunk; must be positive.

    Returns:
        A list of chunks in order. The last chunk may be shorter than
        chunk_size. An empty input yields an empty list.

    Raises:
        ValueError: If chunk_size is zero or negative. Such a value would
            never advance through the text, so it is rejected up front
            instead of looping forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
def extract_gene_name(text):
    """Return the content of the first <NAME>...</NAME> element in text.

    Args:
        text: The payload to search, either UTF-8 encoded bytes or an
            already decoded str.

    Returns:
        The string between the first <NAME> tag and the nearest following
        </NAME> tag, or None when no such pair is present.
    """
    # Decode only when given bytes, so a str payload does not raise
    # AttributeError on .decode().
    if isinstance(text, (bytes, bytearray)):
        text_str = text.decode("utf-8")
    else:
        text_str = text
    # Strip literal two-character escape sequences (backslash + n / t) and
    # unescape backslash-quote; real newlines and tabs are left untouched.
    text_str = text_str.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    match = re.search(r"<NAME>(.*?)</NAME>", text_str)
    return match.group(1) if match else None
def get_geneName(rsid):
    """Fetch the "snp" database record for rsid and extract its <NAME> value.

    Args:
        rsid: Identifier passed as the ``id`` argument of Entrez.efetch.

    Returns:
        Whatever extract_gene_name returns for the fetched XML payload:
        the first <NAME> element's content, or None if there is none.
    """
    handle = Entrez.efetch(db="snp", id=rsid, retmode='xml')
    try:
        payload = handle.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        handle.close()
    return extract_gene_name(payload)
def split_text_into_sentences(text, num_sentences):
    """Tokenize text into sentences and group them num_sentences at a time.

    Returns a list of lists; each inner list holds up to num_sentences
    consecutive sentences as produced by nltk.sent_tokenize, and the final
    group may be shorter.
    """
    tokens = nltk.sent_tokenize(text)
    groups = []
    for offset in range(0, len(tokens), num_sentences):
        groups.append(tokens[offset:offset + num_sentences])
    return groups
def flatten_list(nested_list):
    """Return a flat list of the non-list items of nested_list, in order.

    Only values that are instances of list are descended into; other
    containers (tuples, sets, ...) are kept as single items.
    """
    def _walk(items):
        for element in items:
            if isinstance(element, list):
                yield from _walk(element)
            else:
                yield element

    return list(_walk(nested_list))
def move_file(source_path, destination_path):
    """Move source_path to destination_path, creating the destination first.

    Args:
        source_path: Path of the file to move.
        destination_path: Target path; it is created as a directory (with
            any missing parents) when nothing exists there yet.

    Errors raised by the move itself are reported on stdout instead of being
    propagated, so the call is best-effort.
    """
    # Create the directory directly instead of checking exists() first:
    # the check-then-create sequence can fail if something else creates the
    # path in between.
    try:
        os.makedirs(destination_path)
    except FileExistsError:
        # Already present (possibly created concurrently) - nothing to do.
        pass
    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
    except Exception as e:
        print(f"Error: {e}")
def upper_abbreviation(text):
    """Strip "unknown" and upper-case runs of capitalised words without dots.

    First every occurrence of "unknown" (any letter case) is removed. Then
    each run of words that start with an uppercase letter is rewritten with
    its periods removed and all letters upper-cased.
    """
    without_unknown = re.compile(r'unknown', re.IGNORECASE).sub('', text)
    return re.sub(
        r'\b(?:[A-Z][a-z.]*\.?\s*)+\b',
        lambda found: found.group(0).replace('.', '').upper(),
        without_unknown,
    )
def get_valid_year(input_text):
    """Return the first standalone four-character word in input_text.

    A "word" here is exactly four word characters delimited by word
    boundaries. An empty string is returned when there is none.

    NOTE(review): the pattern accepts letters and underscores as well as
    digits, so a word such as "from" is returned if it comes before a year.
    Confirm whether digits-only matching was intended.
    """
    first_hit = re.search(r'\b\w{4}\b', input_text)
    return first_hit.group(0) if first_hit else ''
def sample_size_postproc(text):
    """Drop alphanumeric tokens that mix digits and letters from text.

    Two passes are made:
      1. Whitespace-separated tokens in which a digit is followed somewhere
         later by an ASCII letter are discarded; the rest are re-joined with
         single spaces.
      2. Any remaining standalone run of ASCII letters immediately followed
         by digits is deleted in place. The surrounding spaces stay.
    """
    digit_then_letter = re.compile(r'.*\d.*[A-Za-z].*$')
    kept_tokens = []
    for token in text.split():
        if digit_then_letter.match(token) is None:
            kept_tokens.append(token)
    return re.sub(r'\b[A-Za-z]+\d+\b', '', ' '.join(kept_tokens))