---
license: apache-2.0
datasets:
- agentlans/high-quality-english-sentences
language:
- en
base_model:
- google-t5/t5-base
pipeline_tag: text2text-generation
library_name: transformers
---

This model corrects typos in English text and outputs the corrected text.

Example:

Text with Typos: **Whathvhr wh call owr carhaivhrs - doctors, nwrsh practitionhrs, clinicians, - wh nhhd thhm not only to carh, wh nhhd thhm to uh aulh to providh thh riaht valwh.**

Corrected Text: **Whatever we call our caregivers - doctors, nurse practitioners, clinicians, - we need them not only to care, we need them to be able to provide the right value.**

Example Usage:
```py
# Load the model and tokenizer (and choose a device) first; see the full pipeline below.
text = ""  # Text with typos here!
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
outputs = model.generate(inputs["input_ids"], max_length=256)
corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
```

Full Pipeline Usage:
```py
import random
from string import ascii_lowercase

import Levenshtein
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One tokenizer is shared by both models in this pipeline.
tokenizer = AutoTokenizer.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng")
alphabet_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng").to(device)
correction_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/AutoCorrect-EN-v2").to(device)


def similarity_percentage(s1, s2):
    """Return the Levenshtein similarity of two strings as a percentage.

    The edit distance is normalised by the length of the longer string, so
    identical strings score 100. Two empty strings are treated as identical.
    """
    max_len = max(len(s1), len(s2))
    if max_len == 0:
        # Avoid dividing by zero when both strings are empty.
        return 100.0
    distance = Levenshtein.distance(s1, s2)
    return (1 - distance / max_len) * 100


def decode(cipher_text, key):
    """Decode ``cipher_text`` with a substitution ``key``.

    The i-th character of ``key`` (at most the first 26 are used) replaces the
    i-th lowercase letter of the alphabet; uppercase letters are mapped to the
    uppercased replacement. Characters without a mapping are left unchanged.
    """
    decipher_map = dict(zip(ascii_lowercase, key))
    decipher_map.update({plain.upper(): sub.upper() for plain, sub in zip(ascii_lowercase, key)})
    return ''.join(decipher_map.get(ch, ch) for ch in cipher_text)


def model_pass(model, input, max_length=256):
    """Run one seq2seq generation pass and return the decoded text.

    ``input`` is tokenized with the shared tokenizer (truncated to 256 tokens),
    ``model.generate`` is called with ``max_length``, and the first generated
    sequence is decoded with special tokens removed.
    """
    # The parameter name ``input`` shadows the builtin; it is kept so existing
    # keyword callers keep working.
    inputs = tokenizer(input, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
    outputs = model.generate(inputs["input_ids"], max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def decipher(cipher_text, key) -> str:
    """Decode the first element of ``cipher_text`` using the first element of ``key``.

    NOTE(review): this indexes ``key[0]`` and ``cipher_text[0]``, so it presumably
    expects batch-style sequences (e.g. lists holding one string each) rather than
    plain strings - confirm against callers. Use ``decode`` for plain strings.
    """
    decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[0])}
    decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[0])})

    result = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text[0]))

    return result


def cipher(plain_text) -> tuple[str, dict[str, str]]:
    """Encrypt ``plain_text`` with a random monoalphabetic substitution.

    Returns the cipher text and the substitution map that produced it. The map
    goes from plaintext letter to cipher letter and contains both lowercase and
    uppercase entries; other characters are copied through unchanged.
    """
    shuffled = list(ascii_lowercase)
    random.shuffle(shuffled)

    alphabet_map = dict(zip(ascii_lowercase, shuffled))
    alphabet_map.update({plain.upper(): sub.upper() for plain, sub in alphabet_map.items()})

    cipher_text = ''.join(alphabet_map.get(ch, ch) for ch in plain_text)
    return cipher_text, alphabet_map


def correct_text(cipher_text, model_output):
    """Re-estimate the substitution alphabet from a model guess and re-decode.

    Both texts are split on single spaces and their words are aligned with an
    edit-distance table in which two words match when they have the same length.
    Letter correspondences are counted over the matched word pairs, a greedy pass
    turns the counts into a cipher-letter -> plaintext-letter map, and
    ``cipher_text`` is decoded with that map. Cipher letters that end up without
    a mapping are rendered as '.'; non-letters are copied through unchanged.
    """
    cipher_words = cipher_text.split(' ')
    model_words = model_output.split(' ')

    # letter_map[c][p]: how often cipher letter c lined up with plaintext letter p.
    letter_map = {i: {j: 0 for j in ascii_lowercase} for i in ascii_lowercase}

    # Levenshtein distance over the lengths of the words
    n = len(cipher_words)
    m = len(model_words)

    dp = [[0 for _ in range(m + 1)] for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if len(cipher_words[i - 1]) == len(model_words[j - 1]):
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1

    # Walk the table backwards to recover the alignment.
    i = n
    j = m
    while i > 0 and j > 0:
        # Cheapest predecessor; ties prefer diagonal, then up, then left.
        before = min([(0, dp[i - 1][j - 1]), (1, dp[i - 1][j]), (2, dp[i][j - 1])], key=lambda x: x[1])
        step = before[0]
        if step == 0:
            if dp[i - 1][j - 1] == dp[i][j]:
                # Same cost means the two words matched: count their letter pairs.
                cipher_word = cipher_words[i - 1]
                model_word = model_words[j - 1]
                for c_letter, m_letter in zip(cipher_word.lower(), model_word.lower()):
                    if c_letter in letter_map and m_letter in letter_map[c_letter]:
                        letter_map[c_letter][m_letter] += 1
            i = i - 1
            j = j - 1
        elif step == 1:
            i = i - 1
        elif step == 2:
            j = j - 1

    for letter in ascii_lowercase:
        letter_sum = sum(letter_map[letter].values())
        if letter_sum == 0:
            # That letter wasn't in the text
            letter_map[letter] = None
            continue

        # Candidates sorted from most occurring to least, as (plaintext letter, share).
        letter_map[letter] = [
            (k, v / letter_sum)
            for k, v in sorted(letter_map[letter].items(), key=lambda item: item[1], reverse=True)
        ]

    # change_map[p] = (cipher letter, rank, share) currently claiming plaintext letter p.
    change_map = {letter: None for letter in ascii_lowercase}

    for rank in range(len(ascii_lowercase)):
        for letter in ascii_lowercase:
            if letter_map[letter] is None:
                continue  # That letter wasn't in the text

            map_letter, share = letter_map[letter][rank]
            current = change_map[map_letter]
            # Claim an unclaimed plaintext letter; an existing claim is only
            # displaced by a stronger one made at the same rank.
            if share > 0 and (current is None or (current[2] < share and current[1] >= rank)):
                change_map[map_letter] = (letter, rank, share)  # Letter, iteration, percentage

    # Invert to cipher letter -> plaintext letter.
    change_map = {claim[0]: plain for plain, claim in change_map.items() if claim is not None}

    for letter in ascii_lowercase:
        if letter not in change_map:
            change_map[letter] = '.'

    # Add uppercases
    change_map.update({k.upper(): v.upper() for k, v in change_map.items()})

    new_text = [
        ''.join(change_map.get(c_letter, c_letter) for c_letter in cipher_word)
        for cipher_word in cipher_words
    ]

    return ' '.join(new_text)


def crack_sub(cipher_text):
    """Solve a monoalphabetic substitution cipher and return the recovered text.

    Pipeline: predict a 26-letter key with ``alphabet_model``, decode with it,
    clean the result with ``correction_model``, rebuild the alphabet from that
    cleaned text via ``correct_text``, then run the correction model once more.
    Blank input is returned unchanged.
    """
    if not cipher_text.strip():
        # Nothing to decode; also avoids asking generate() for zero tokens.
        return cipher_text

    output = model_pass(alphabet_model, cipher_text, 26)
    decoded = decode(cipher_text, output)

    # The character count serves as the generation budget. Keep a small floor so
    # very short texts still leave generate() room to produce output.
    length_budget = max(len(decoded), 8)

    second_pass = model_pass(correction_model, decoded, length_budget)
    second_text = correct_text(cipher_text, second_pass)
    third_pass = model_pass(correction_model, second_text, length_budget)

    return third_pass


"""
Use crack_sub() function to solve monoalphabetic substitution ciphers!
"""
```