Spaces:

jbilcke-hf
/

tikslop

Running on CPU Upgrade

File size: 5,741 Bytes

7dadc22

"""
Generic utility functions used across the application.
"""
import random
import re


def generate_seed():
    """Return a random integer seed that fits in an unsigned 32-bit value.

    The result is drawn uniformly from the inclusive range
    ``[0, 2**32 - 1]``; note that 0 is a possible outcome.
    """
    exclusive_upper_bound = 1 << 32  # largest seed returned is 2**32 - 1
    return random.randrange(exclusive_upper_bound)


# Matches, in order: a backslash escape pair, a lone trailing backslash,
# or a bare double quote.
_YAML_ESCAPE_TOKEN_RE = re.compile(r'\\.|\\\Z|"')
# "title:" / "description:" at the start of a line (optional blanks before ':').
_YAML_FIELD_RE = re.compile(r'(title|description)[ \t]*:(.*)')
# "tags:" at the start of a line, capturing any inline value.
_YAML_TAGS_RE = re.compile(r'tags[ \t]*:(.*)')
# YAML document start / end markers when they stand alone on a line.
_YAML_DOC_MARKERS = ('---', '...')


def _strip_code_fence(text: str) -> str:
    """Remove a leading markdown code fence (and its closing fence) from *text*."""
    if not text.startswith('```'):
        return text
    body = text[3:]
    # Drop an optional "yaml" / "yml" language tag (any letter case).
    language_tag = re.match(r'ya?ml', body, flags=re.IGNORECASE)
    if language_tag:
        body = body[language_tag.end():]
    if body.endswith('```'):
        body = body[:-3]
    return body.strip()


def _escape_for_double_quotes(value: str) -> str:
    """Escape *value* so it can be wrapped in double quotes.

    Bare double quotes are escaped. Existing backslash pairs (such as an
    already escaped quote) are passed through untouched so they are not
    escaped twice, and a lone trailing backslash is doubled so it cannot
    swallow the closing quote.
    """
    def _fix(match):
        token = match.group(0)
        if token == '"':
            return '\\"'
        if token == '\\':
            return '\\\\'
        return token

    return _YAML_ESCAPE_TOKEN_RE.sub(_fix, value)


def _clean_tag(raw_tag: str) -> str:
    """Normalise one tag to lowercase ASCII letters, digits and hyphens."""
    tag = raw_tag.strip().strip('"\'')
    tag = re.sub(r'[^\x00-\x7F]+', '', tag)  # Remove non-ASCII
    tag = re.sub(r'[^a-zA-Z0-9\s-]', '', tag)  # Keep only alphanumeric and hyphen
    return tag.strip().lower().replace(' ', '-')


def sanitize_yaml_response(response_text: str) -> str:
    """
    Sanitize and format an AI response into a small YAML document.

    The output always contains the keys ``title``, ``description`` and
    ``tags``. Title and description values are emitted as double-quoted
    scalars; tags are emitted as a block list of lowercase ASCII slugs.
    Keys found in the input keep their input order; missing keys are
    appended with placeholder values in the fixed order title,
    description, tags.

    Args:
        response_text: Raw model output. It may be wrapped in a markdown
            code fence, may start with the pre-filled ``title: \\"`` prompt
            fragment, or may be bare text (which is then used as the title).

    Returns:
        The sanitized YAML text, lines joined with ``\\n``.
    """
    # Surrounding whitespace must go first, otherwise a fence preceded by a
    # newline is not recognised and the payload is lost by the split below.
    text = _strip_code_fence(response_text.strip())

    # Drop leading YAML document-start markers so the title check sees the key.
    text = re.sub(r'\A(?:---\s*)+', '', text)

    # Handle edge case where the LLM repeated the incomplete prompt we provided.
    prompt_prefix = 'title: \\"'
    if text.startswith(prompt_prefix):
        text = text[len(prompt_prefix):].strip()

    # Only wrap with a title key if the text doesn't already start with one.
    if not re.match(r'title[ \t]*:', text):
        text = f'title: {text}'

    # Anything after a remaining code fence is not part of the YAML payload.
    clean_text = text.split("```")[0].strip()

    # Handle the specific case where LLM duplicates 'title:' in the value
    # e.g., title: "title: "Something"" -> title: "Something"
    clean_text = re.sub(r'title:\s*"title:\s*"([^"]+)""?', r'title: "\1"', clean_text)
    clean_text = re.sub(r'title:\s*\'title:\s*\'([^\']+)\'\'?', r'title: \'\1\'', clean_text)
    clean_text = re.sub(r'title:\s*"title:\s*\'([^\']+)\'"?', r'title: "\1"', clean_text)
    clean_text = re.sub(r'title:\s*\'title:\s*"([^"]+)"\'?', r'title: \'\1\'', clean_text)

    # Also handle case where title appears twice without quotes
    clean_text = re.sub(r'title:\s*title:\s*(.+)$', r'title: \1', clean_text, flags=re.MULTILINE)

    sanitized_lines = []
    found_fields = set()
    current_field = None

    for line in clean_text.split('\n'):
        stripped = line.strip()
        # Skip blank lines and stand-alone document markers; markers embedded
        # in a value (e.g. "a --- b" or a trailing ellipsis) are kept.
        if not stripped or stripped in _YAML_DOC_MARKERS:
            continue

        field_match = _YAML_FIELD_RE.match(stripped)
        tags_match = _YAML_TAGS_RE.match(stripped)

        if field_match:
            field_name = field_match.group(1)
            field_value = field_match.group(2).strip()

            # Remove matching outer quotes first
            if field_value[:1] in ('"', "'") and field_value.endswith(field_value[:1]):
                field_value = field_value[1:-1]

            # Check for nested title pattern again (in case it wasn't caught by regex)
            if field_name == 'title' and field_value.lower().startswith('title:'):
                field_value = field_value[6:].strip().strip('"\'')

            # Always quote the value to ensure proper YAML formatting
            sanitized_lines.append(
                f'{field_name}: "{_escape_for_double_quotes(field_value)}"'
            )
            found_fields.add(field_name)
            current_field = field_name

        elif tags_match:
            sanitized_lines.append('tags:')
            found_fields.add('tags')
            current_field = 'tags'

            # Accept an inline list such as `tags: [a, b]` or `tags: a, b`.
            inline = tags_match.group(1).strip()
            if inline.startswith('[') and inline.endswith(']'):
                inline = inline[1:-1]
            for raw_tag in inline.split(','):
                tag = _clean_tag(raw_tag)
                if tag:
                    sanitized_lines.append(f"  - {tag}")

        elif stripped.startswith('-') and current_field == 'tags':
            tag = _clean_tag(stripped[1:])
            if tag:
                sanitized_lines.append(f"  - {tag}")

        elif current_field in ('title', 'description'):
            # Multi-line title/description continuation: fold it into the
            # previous quoted value, escaping it like the first line.
            value = stripped.strip('"\'')
            if value:
                prev = sanitized_lines[-1]
                if prev.endswith('"'):
                    sanitized_lines[-1] = (
                        f'{prev[:-1]} {_escape_for_double_quotes(value)}"'
                    )

    # A tags header that ended up with no usable items gets the default tag,
    # so the key never holds an empty (null) value.
    completed_lines = []
    for index, line in enumerate(sanitized_lines):
        completed_lines.append(line)
        if line == 'tags:':
            following = sanitized_lines[index + 1] if index + 1 < len(sanitized_lines) else ''
            if not following.startswith('  - '):
                completed_lines.append('  - default')

    # Ensure the YAML has all required fields, appended in a fixed order so
    # the output is deterministic.
    for field in ('title', 'description', 'tags'):
        if field in found_fields:
            continue
        if field == 'tags':
            completed_lines.extend(['tags:', '  - default'])
        else:
            completed_lines.append(f'{field}: "No {field} provided"')

    return '\n'.join(completed_lines)