File size: 5,741 Bytes
7dadc22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Generic utility functions used across the application.
"""
import random
import re


def generate_seed():
    """Return a random non-negative integer seed that fits in 32 unsigned bits.

    The result is drawn uniformly from ``0`` to ``2**32 - 1`` inclusive using
    the module-level ``random`` generator.
    """
    exclusive_upper_bound = 1 << 32
    return random.randrange(exclusive_upper_bound)


def _strip_code_fence(text: str) -> str:
    """Drop a leading Markdown fence (with optional yaml/yml tag) and its closer.

    Text that does not start with a fence is returned unchanged.
    """
    opening = re.match(r'```(?:ya?ml)?', text, flags=re.IGNORECASE)
    if opening is None:
        return text
    text = text[opening.end():]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _escape_double_quotes(value: str) -> str:
    """Escape bare double quotes so *value* can sit inside a double-quoted scalar.

    Existing backslash pairs (for example an already-escaped quote) are kept
    as they are, so they are not escaped a second time. A dangling backslash
    at the very end is doubled so it cannot swallow the closing quote.
    """
    def _escape_token(match):
        token = match.group(0)
        if token == '"':
            return '\\"'
        if token == '\\':
            return '\\\\'
        return token

    return re.sub(r'\\.|\\$|"', _escape_token, value)


def sanitize_yaml_response(response_text: str) -> str:
    """
    Sanitize and format an AI response into a small YAML document.

    The result always contains ``title``, ``description`` and ``tags``.
    Title and description values are emitted as double-quoted scalars, tags
    as a block list of lower-case, hyphen-separated ASCII slugs. Fields that
    are missing from the response are appended with placeholder values, always
    in the order title, description, tags.

    Args:
        response_text: Raw model output; may be wrapped in a Markdown code
            fence and may or may not start with ``title:``.

    Returns:
        The sanitized YAML text, one entry per line, joined with newlines.
    """
    # Strip surrounding whitespace first so a fence preceded by a blank line
    # is still recognised.
    text = _strip_code_fence(response_text.strip())

    # Handle edge case where the LLM continued the incomplete prompt prefix
    # (the literal characters: title: \").
    if text.startswith('title: \\"'):
        text = text[9:].strip()

    # Treat a response without a leading title key as the title value itself.
    if not re.match(r'title[ \t]*:', text):
        text = f'title: {text}'

    # Anything after a remaining fence marker is discarded.
    text = text.split("```")[0]

    # Remove leftover fence markers and YAML document markers.
    # NOTE(review): '---' is removed wherever it occurs, not only on a line of
    # its own, so a literal '---' inside a value is dropped as well.
    clean_text = re.sub(r'```yaml|```|---|\.\.\.$', '', text.strip())

    # Collapse a duplicated key inside the value,
    # e.g. title: "title: "Something"" -> title: "Something"
    clean_text = re.sub(r'title:\s*"title:\s*"([^"]+)""?', r'title: "\1"', clean_text)
    clean_text = re.sub(r'title:\s*\'title:\s*\'([^\']+)\'\'?', r'title: \'\1\'', clean_text)
    clean_text = re.sub(r'title:\s*"title:\s*\'([^\']+)\'"?', r'title: "\1"', clean_text)
    clean_text = re.sub(r'title:\s*\'title:\s*"([^"]+)"\'?', r'title: \'\1\'', clean_text)

    # Same duplication without quotes.
    clean_text = re.sub(r'title:\s*title:\s*(.+)$', r'title: \1', clean_text, flags=re.MULTILINE)

    sanitized_lines = []
    seen_fields = set()
    current_field = None

    for line in clean_text.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue

        # Accept optional blanks before the colon so 'title : x' is handled
        # the same way as 'title: x'.
        scalar_field = re.match(r'(title|description)[ \t]*:', stripped)

        if scalar_field:
            field_name = scalar_field.group(1)
            field_value = stripped[scalar_field.end():].strip()

            # Remove one pair of matching outer quotes.
            if (field_value.startswith('"') and field_value.endswith('"')) or \
               (field_value.startswith("'") and field_value.endswith("'")):
                field_value = field_value[1:-1]

            # A nested 'title:' prefix may survive the regex pass above.
            if field_name == 'title' and field_value.lower().startswith('title:'):
                field_value = field_value[6:].strip().strip('"\'')

            # Always emit a double-quoted scalar.
            sanitized_lines.append(
                f'{field_name}: "{_escape_double_quotes(field_value)}"'
            )
            seen_fields.add(field_name)
            current_field = field_name

        elif re.match(r'tags[ \t]*:', stripped):
            sanitized_lines.append('tags:')
            seen_fields.add('tags')
            current_field = 'tags'

        elif stripped.startswith('-') and current_field == 'tags':
            tag = stripped[1:].strip().strip('"\'')
            if tag:
                tag = re.sub(r'[^\x00-\x7F]+', '', tag)  # Remove non-ASCII
                tag = re.sub(r'[^a-zA-Z0-9\s-]', '', tag)  # Keep alphanumerics, whitespace, hyphen
                tag = tag.strip().lower().replace(' ', '-')
                if tag:
                    sanitized_lines.append(f"  - {tag}")

        elif current_field in ('title', 'description'):
            # Continuation of a multi-line title/description: splice it in
            # before the closing quote, escaped like the first line.
            value = stripped.strip('"\'')
            if value:
                prev = sanitized_lines[-1]
                if prev.endswith('"'):
                    sanitized_lines[-1] = f'{prev[:-1]} {_escape_double_quotes(value)}"'

    # Append placeholders in a fixed order so the output is deterministic
    # (iterating a set difference would vary with string hash randomisation).
    for field in ('title', 'description', 'tags'):
        if field in seen_fields:
            continue
        if field == 'tags':
            sanitized_lines.extend(['tags:', '  - default'])
        else:
            sanitized_lines.append(f'{field}: "No {field} provided"')

    return '\n'.join(sanitized_lines)