# Tools/image/remove_metadata.py
# Repository page residue: "jebin2's picture"
# Commit c8ef5ef: "remove metadata issue fix"
"""
Image Metadata Remover Module
This module provides a comprehensive class-based metadata remover that strips ALL metadata
from images while preserving original quality, format, and aspect ratio.
Handles EXIF, IPTC, XMP, ICC profiles, and all other embedded metadata for security/privacy.
"""
from .image_base import ImageBase
import os
from custom_logger import logger_config
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
from typing import Dict
import base64
class RemoveMetadata(ImageBase):
    """
    Image metadata remover for security and privacy.

    Each image is rebuilt from its pixel data (plus the palette and the
    transparency information needed to render it) and re-saved without EXIF,
    IPTC, XMP, ICC profiles, text chunks or embedded thumbnails.
    """

    # image.info keys that Pillow derives from the container / codec structure
    # of a freshly written file (JFIF header fields, resolution defaults,
    # compression scheme, GIF header fields, transparency, ...). They are
    # needed to decode or render the image and carry no user information, so
    # verification must not report them as leftover metadata.
    _STRUCTURAL_INFO_KEYS = frozenset({
        'jfif', 'jfif_version', 'jfif_unit', 'jfif_density',
        'dpi', 'resolution', 'compression',
        'progressive', 'progression', 'interlace',
        'transparency',
        'version', 'background', 'duration', 'loop', 'extension', 'timestamp',
    })

    # Baseline TIFF tags that only describe the pixel layout (dimensions,
    # sample format, strip locations, colour map, resolution).
    _STRUCTURAL_TIFF_TAGS = frozenset({
        254, 256, 257, 258, 259, 262, 266, 273, 277, 278, 279,
        282, 283, 284, 296, 317, 320, 338, 339,
    })

    # Image modes for which each writer accepts a `transparency` save option.
    _TRANSPARENCY_MODES = {
        'PNG': frozenset({'1', 'L', 'I', 'P', 'RGB'}),
        'GIF': frozenset({'L', 'P'}),
    }

    def __init__(self):
        super().__init__("remove_metadata")

    def _extract_all_metadata(self, image: Image.Image) -> Dict:
        """
        Extract all metadata Pillow exposes for the image.

        Every section is best-effort: a failure in one section is recorded
        under a ``*_error`` key and does not stop the remaining sections.

        Args:
            image: PIL Image object

        Returns:
            Dictionary with the metadata found. ``basic_properties`` is always
            present; the other keys appear only when that kind of data exists.
        """
        all_metadata = {}
        info = getattr(image, 'info', None) or {}

        # 1. EXIF data. _getexif() only exists on some plugins and returns
        #    None when the file has no EXIF; call it once because every call
        #    re-parses the EXIF block.
        try:
            exif_getter = getattr(image, '_getexif', None)
            exif_dict = exif_getter() if exif_getter else None
            if exif_dict:
                all_metadata['exif_raw'] = exif_dict
                exif_decoded = {}
                for tag_id, value in exif_dict.items():
                    tag = TAGS.get(tag_id, tag_id)
                    # GPS data is a nested IFD with its own tag namespace.
                    if tag == 'GPSInfo' and isinstance(value, dict):
                        exif_decoded[tag] = {
                            GPSTAGS.get(gps_tag_id, gps_tag_id): gps_value
                            for gps_tag_id, gps_value in value.items()
                        }
                    else:
                        exif_decoded[tag] = value
                all_metadata['exif_decoded'] = exif_decoded
        except Exception as e:
            all_metadata['exif_error'] = str(e)

        # 2. Everything in image.info (includes IPTC, XMP, ICC, etc.)
        if info:
            info_data = {}
            for key, value in info.items():
                if isinstance(value, bytes):
                    try:
                        # Strict decoding, so genuinely binary payloads reach
                        # the base64 branch instead of being silently mangled.
                        info_data[key] = value.decode('utf-8')
                    except UnicodeDecodeError:
                        preview = base64.b64encode(value[:100]).decode()
                        info_data[key] = f"<binary_data_base64>{preview}</binary_data_base64>"
                else:
                    info_data[key] = value
            all_metadata['info'] = info_data

        # 3. ICC profile (color management)
        try:
            if 'icc_profile' in info:
                icc_profile = info['icc_profile']
                all_metadata['icc_profile_size'] = len(icc_profile) if icc_profile else 0
                all_metadata['icc_profile_present'] = bool(icc_profile)
        except Exception as e:
            all_metadata['icc_error'] = str(e)

        # 4. XMP metadata
        try:
            if 'xmp' in info:
                xmp_data = info['xmp']
                if isinstance(xmp_data, bytes):
                    xmp_data = xmp_data.decode('utf-8', errors='ignore')
                all_metadata['xmp'] = xmp_data
        except Exception as e:
            all_metadata['xmp_error'] = str(e)

        # 5. IPTC metadata
        try:
            iptc_data = {k: v for k, v in info.items() if k.startswith('iptc')}
            if iptc_data:
                all_metadata['iptc'] = iptc_data
        except Exception as e:
            all_metadata['iptc_error'] = str(e)

        # 6. PIL tag and tag_v2 (TIFF-style tag directories)
        try:
            legacy_tags = getattr(image, 'tag', None)
            if legacy_tags:
                all_metadata['pil_tag'] = dict(legacy_tags)
        except Exception as e:
            all_metadata['pil_tag_error'] = str(e)
        try:
            tags_v2 = getattr(image, 'tag_v2', None)
            if tags_v2:
                all_metadata['pil_tag_v2'] = dict(tags_v2)
        except Exception as e:
            all_metadata['pil_tag_v2_error'] = str(e)

        # 7. Quantization tables (JPEG compression info)
        try:
            quantization = getattr(image, 'quantization', None)
            if quantization:
                all_metadata['quantization_tables'] = len(quantization)
        except Exception as e:
            all_metadata['quantization_error'] = str(e)

        # 8. Embedded thumbnails
        if 'thumbnail' in info:
            all_metadata['thumbnail_present'] = True

        # 9. Text chunks (PNG specific)
        try:
            if image.format == 'PNG' and hasattr(image, 'text'):
                all_metadata['png_text_chunks'] = dict(image.text)
        except Exception as e:
            all_metadata['png_text_error'] = str(e)

        # 10. Basic image properties
        all_metadata['basic_properties'] = {
            'format': getattr(image, 'format', None),
            'mode': getattr(image, 'mode', None),
            'size': getattr(image, 'size', None),
            'filename': getattr(image, 'filename', None),
            'format_description': getattr(image, 'format_description', None)
        }
        return all_metadata

    def _get_quality_settings_for_format(self, format_name: str, original_mode: str) -> Dict:
        """
        Get the save settings used for a format: highest quality, no metadata.

        Args:
            format_name: PIL format name
            original_mode: Original image color mode (currently not used)

        Returns:
            A new dictionary of save parameters; empty for formats without
            dedicated settings.
        """
        if format_name == 'JPEG':
            return {
                'quality': 100,        # Maximum quality
                'optimize': False,     # Don't optimize to preserve quality
                'progressive': False,  # Standard baseline JPEG
                'subsampling': 0,      # No chroma subsampling
                'exif': b'',           # Explicitly remove EXIF
                'icc_profile': None    # Remove ICC profile
            }
        if format_name == 'PNG':
            return {
                'optimize': False,     # Don't optimize to preserve quality
                'compress_level': 1,   # Minimal compression
                'icc_profile': None,   # Remove ICC profile
                'pnginfo': None        # Remove PNG info chunks
            }
        if format_name == 'WEBP':
            return {
                'lossless': True,      # Lossless compression
                'quality': 100,        # Maximum quality
                'method': 6,           # Best compression method
                'icc_profile': None,   # Remove ICC profile
                'exif': b''            # Remove EXIF
            }
        if format_name == 'TIFF':
            return {
                'compression': None,   # No compression
                'icc_profile': None    # Remove ICC profile
            }
        # BMP, GIF and anything else: no dedicated settings.
        return {}

    def _transparency_for_save(self, image: Image.Image, target_format: str):
        """
        Return the transparency value to pass to ``save()``, or None.

        Palette and colour-key transparency is stored in
        ``image.info['transparency']``. It is part of how the image renders,
        so it has to survive the cleaning. It is only returned for
        format/mode combinations listed in ``_TRANSPARENCY_MODES``.
        """
        allowed_modes = self._TRANSPARENCY_MODES.get(target_format)
        if not allowed_modes or image.mode not in allowed_modes:
            return None
        return image.info.get('transparency')

    def _clean_png_with_transparency(self, image: Image.Image, output_path: str) -> bool:
        """
        Clean a PNG in RGBA, LA or P mode, keeping its transparency.

        Args:
            image: PIL Image object
            output_path: Path for output file

        Returns:
            True if successful

        Raises:
            Exception: If the image cannot be rebuilt or saved
        """
        try:
            original_mode = image.mode
            # Zero-filled canvas. No explicit fill colour: every pixel is
            # overwritten by paste() below, and a 4-tuple fill is rejected
            # for the two-band 'LA' mode.
            clean_img = Image.new(original_mode, image.size)
            if original_mode == 'P':
                palette = image.getpalette()
                if palette:
                    clean_img.putpalette(palette)
            clean_img.paste(image, (0, 0))

            # Ensure no metadata is carried over
            clean_img.info = {}

            save_settings = self._get_quality_settings_for_format('PNG', original_mode)
            transparency = self._transparency_for_save(image, 'PNG')
            if transparency is not None:
                # Palette transparency (tRNS) is pixel-visible data.
                save_settings['transparency'] = transparency
            clean_img.save(output_path, format='PNG', **save_settings)
            return True
        except Exception as e:
            raise Exception(f"PNG transparency cleaning failed: {e}") from e

    def _clean_jpeg_image(self, image: Image.Image, output_path: str) -> bool:
        """
        Clean an image and save it as a maximum-quality JPEG without metadata.

        Images that are not already RGB are converted to RGB first.

        Args:
            image: PIL Image object
            output_path: Path for output file

        Returns:
            True if successful

        Raises:
            Exception: If the image cannot be converted or saved
        """
        try:
            if image.mode != 'RGB':
                clean_img = image.convert('RGB')
            else:
                # Fresh canvas so that nothing but pixels is transferred.
                clean_img = Image.new('RGB', image.size)
                clean_img.paste(image, (0, 0))

            # convert() copies info, so clear it in both cases.
            clean_img.info = {}

            save_settings = self._get_quality_settings_for_format('JPEG', image.mode)
            clean_img.save(output_path, format='JPEG', **save_settings)
            return True
        except Exception as e:
            raise Exception(f"JPEG cleaning failed: {e}") from e

    def _clean_other_format(self, image: Image.Image, output_path: str, original_format: str) -> bool:
        """
        Clean an image of any other format, saving it in its original format.

        Args:
            image: PIL Image object
            output_path: Path for output file
            original_format: Original image format

        Returns:
            True if successful

        Raises:
            Exception: If the image cannot be rebuilt or saved
        """
        try:
            clean_img = Image.new(image.mode, image.size)
            if image.mode == 'P':
                palette = image.getpalette()
                if palette:
                    clean_img.putpalette(palette)
            clean_img.paste(image, (0, 0))

            # Ensure absolutely no metadata is carried over
            clean_img.info = {}

            save_settings = self._get_quality_settings_for_format(original_format, image.mode)
            transparency = self._transparency_for_save(image, original_format)
            if transparency is not None:
                save_settings['transparency'] = transparency
            clean_img.save(output_path, format=original_format, **save_settings)
            return True
        except Exception as e:
            raise Exception(f"{original_format} cleaning failed: {e}") from e

    def _find_sensitive_leftovers(self, metadata: Dict) -> Dict:
        """
        Reduce an extraction result to the entries that are real metadata.

        Structural ``info`` keys and baseline TIFF layout tags are dropped.
        JPEG quantization tables are ignored entirely, because a JPEG cannot
        be decoded without them.
        """
        leftovers = {}
        for key in ('exif_raw', 'exif_decoded', 'icc_profile_present', 'xmp',
                    'iptc', 'thumbnail_present', 'png_text_chunks'):
            if metadata.get(key):
                leftovers[key] = metadata[key]

        extra_info = {k: v for k, v in (metadata.get('info') or {}).items()
                      if k not in self._STRUCTURAL_INFO_KEYS}
        if extra_info:
            leftovers['info'] = extra_info

        for key in ('pil_tag', 'pil_tag_v2'):
            extra_tags = {k: v for k, v in (metadata.get(key) or {}).items()
                          if k not in self._STRUCTURAL_TIFF_TAGS}
            if extra_tags:
                leftovers[key] = extra_tags
        return leftovers

    def _verify_metadata_removal(self, output_path) -> bool:
        """
        Re-open the cleaned file and check that no metadata is left.

        Args:
            output_path: Path of the cleaned image

        Returns:
            True if nothing but structural data remains, False otherwise

        Raises:
            Exception: If the cleaned file cannot be opened or analysed
        """
        try:
            with Image.open(output_path) as img:
                remaining_metadata = self._extract_all_metadata(img)
            remaining_sensitive = self._find_sensitive_leftovers(remaining_metadata)
            if remaining_sensitive:
                logger_config.warning(f"WARNING: Sensitive metadata still present: {list(remaining_sensitive.keys())}")
                logger_config.debug(f"Remaining metadata: {remaining_sensitive}")
                return False
            logger_config.success("ALL metadata successfully removed - image is clean")
            return True
        except Exception as e:
            raise Exception(f"Verification failed: {e}") from e

    def _verify_image_quality(self, original_path: str, cleaned_path: str) -> bool:
        """
        Verify that the cleaned file exists, is non-empty and kept its dimensions.

        Args:
            original_path: Path to original image
            cleaned_path: Path to cleaned image

        Returns:
            True if the checks pass, False if the cleaned file does not exist

        Raises:
            Exception: If the cleaned file is empty, unreadable, or its
                dimensions differ from the original
        """
        try:
            # Check the file before opening it, otherwise this is unreachable.
            if not os.path.exists(cleaned_path):
                return False
            cleaned_size = os.path.getsize(cleaned_path)
            if cleaned_size == 0:
                raise Exception("Cleaned file is empty")

            with Image.open(original_path) as original, Image.open(cleaned_path) as cleaned:
                original_dims = original.size
                cleaned_dims = cleaned.size
            if original_dims != cleaned_dims:
                raise Exception(f"Dimension mismatch! Original: {original_dims}, Cleaned: {cleaned_dims}")

            logger_config.success(f"Quality preserved - Size: {cleaned_dims}, File size: {cleaned_size:,} bytes")
            return True
        except Exception as e:
            raise Exception(f"Quality verification failed: {e}") from e

    def process(self, input_file_name: str):
        """
        Remove the metadata from an image and verify the result.

        The cleaned image is written to the path returned by
        ``_generate_output_path()``. Removed data includes EXIF (camera
        settings, GPS coordinates, timestamps), IPTC, XMP, ICC color profiles,
        embedded thumbnails and PNG text chunks.

        Args:
            input_file_name: Input image name, resolved against ``self.input_dir``

        Returns:
            Tuple ``(output_path, extracted_metadata)`` on success. On any
            failure the error is logged and ``(None, {})`` is returned; no
            exception propagates to the caller.
        """
        try:
            self.input_file_name = input_file_name
            self.input_file_path = f'{self.input_dir}/{self.input_file_name}'

            # Validate input
            self._validate_input_file()

            # Generate output path
            output_path = self._generate_output_path()
            logger_config.info(f"Processing: {self.input_file_path}")

            with Image.open(self.input_file_path) as image:
                original_format = image.format
                original_mode = image.mode

                # Extract the metadata before it is stripped
                extracted_metadata = self._extract_all_metadata(image)

                logger_config.info("=== COMPREHENSIVE METADATA ANALYSIS ===")
                non_empty = {k: v for k, v in extracted_metadata.items() if v}
                if non_empty:
                    for key, value in non_empty.items():
                        logger_config.info(f"{key}: {value}")
                else:
                    logger_config.info("No metadata found in original image")
                logger_config.info("=== END METADATA ANALYSIS ===")

                # Choose the cleaning method based on format
                if original_format == 'PNG' and original_mode in ('RGBA', 'LA', 'P'):
                    success = self._clean_png_with_transparency(image, output_path)
                elif original_format == 'JPEG' or output_path.lower().endswith(('.jpg', '.jpeg')):
                    success = self._clean_jpeg_image(image, output_path)
                else:
                    success = self._clean_other_format(image, output_path, original_format)

            if not success:
                raise Exception("Metadata cleaning failed")

            # Verify results; a False return means the output file is missing.
            if not self._verify_image_quality(self.input_file_path, output_path):
                raise Exception(f"Cleaned image was not written: {output_path}")

            # Only claim complete removal when verification confirms it; the
            # verifier already logs a warning listing anything left behind.
            is_clean = self._verify_metadata_removal(output_path)
            logger_config.success(f"Cleaned image saved: {output_path}")
            if is_clean:
                logger_config.success("ALL METADATA REMOVED - Image is secure for sharing")
            return output_path, extracted_metadata
        except Exception as e:
            logger_config.error(f"Failed to clean image: {str(e)}")
            return None, {}
# Example usage
if __name__ == "__main__":
    cleaner = RemoveMetadata()
    cleaned_file, metadata = cleaner.process("image/input/test.png")
    # process() reports failure by returning (None, {}), so only announce
    # success when an output file was actually produced.
    if cleaned_file is None:
        print("Metadata removal failed - see the log output for details.")
    else:
        print(f"Output: {cleaned_file}")
        print(f"Extracted metadata: {metadata}")
        print("Image is now secure for sharing - all metadata removed!")