Spaces:

jebin2
/

Tools

Sleeping

App Files Files Community

Tools / image /remove_metadata.py

jebin2

remove metadata issue fix

c8ef5ef about 2 months ago

raw

history blame contribute delete

14.7 kB

	"""
	Image Metadata Remover Module

	This module provides a comprehensive class-based metadata remover that rebuilds an image
	from its pixel data and re-saves it in the original format, preserving dimensions and aspect ratio.
	Handles EXIF, IPTC, XMP, ICC profiles, and all other embedded metadata for security/privacy.
	Lossy formats such as JPEG are re-encoded at maximum quality, so pixels may not be bit-identical.
	"""

	from .image_base import ImageBase
	import os
	from custom_logger import logger_config
	from PIL import Image
	from PIL.ExifTags import TAGS, GPSTAGS
	from typing import Dict
	import base64

	class RemoveMetadata(ImageBase):
	    """
	    Comprehensive image metadata remover for security and privacy.

	    The image is rebuilt from its pixel data only and re-saved in its original
	    format, so EXIF, IPTC, XMP, ICC profiles, thumbnails, GPS data, camera
	    settings and PNG text chunks are not written to the output file.
	    """

	    # ``image.info`` keys that describe how a file is encoded (or how it must be
	    # rendered) rather than carrying embedded user metadata. Pillow reports them
	    # even for a freshly written file, so verification must not flag them.
	    _STRUCTURAL_INFO_KEYS = frozenset({
	        'jfif', 'jfif_version', 'jfif_unit', 'jfif_density',
	        'adobe', 'adobe_transform', 'dpi', 'resolution',
	        'transparency', 'compression', 'progressive', 'progression',
	        'interlace', 'gamma', 'version', 'background', 'duration',
	        'loop', 'extension',
	    })

	    def __init__(self):
	        super().__init__("remove_metadata")

	    @staticmethod
	    def _describe_info_value(value):
	        """Return a printable form of an ``image.info`` value.

	        Bytes that are valid UTF-8 are returned as text; any other bytes are
	        summarised as a base64 preview of their first 100 bytes. Non-bytes
	        values are returned unchanged.
	        """
	        if not isinstance(value, bytes):
	            return value
	        try:
	            # Strict decoding on purpose: with errors='ignore' the fallback
	            # below could never run and binary blobs were silently mangled.
	            return value.decode('utf-8')
	        except UnicodeDecodeError:
	            preview = base64.b64encode(value[:100]).decode()
	            return f"<binary_data_base64>{preview}</binary_data_base64>"

	    @staticmethod
	    def _pixel_only_copy(image: Image.Image) -> Image.Image:
	        """Build a new image holding only the visual content of ``image``.

	        Pixel data and (for palette images) the palette are copied. The only
	        ``info`` entry carried over is ``transparency``, because it changes how
	        the pixels are rendered and is not descriptive metadata.
	        """
	        # The default fill is valid for every mode; an explicit 4-tuple colour
	        # is rejected by Pillow for two-band modes such as 'LA'.
	        clean_img = Image.new(image.mode, image.size)

	        if image.mode == 'P':
	            palette = image.getpalette()
	            if palette:
	                clean_img.putpalette(palette)

	        # Same-mode paste copies the raw pixel values over the whole canvas.
	        clean_img.paste(image, (0, 0))

	        clean_img.info = {}
	        transparency = image.info.get('transparency')
	        if transparency is not None:
	            # Kept in ``info`` (not passed as a save option) so writers that
	            # support it pick it up and the others simply ignore it.
	            clean_img.info['transparency'] = transparency
	        return clean_img

	    def _extract_all_metadata(self, image: Image.Image) -> Dict:
	        """
	        Extract ALL available metadata from the image for security analysis.

	        Each extraction step is independent: a failure in one step is recorded
	        under an ``*_error`` key and does not prevent the remaining steps.

	        Args:
	            image: PIL Image object

	        Returns:
	            Dictionary containing comprehensive metadata extraction. The
	            ``basic_properties`` key is always present.
	        """
	        all_metadata = {}
	        info = getattr(image, 'info', None) or {}

	        # 1. Extract EXIF data (most common and detailed)
	        try:
	            getexif = getattr(image, '_getexif', None)
	            exif_dict = getexif() if callable(getexif) else None
	            if exif_dict:
	                all_metadata['exif_raw'] = exif_dict

	                # Decode EXIF tags to human-readable format
	                exif_decoded = {}
	                for tag_id, value in exif_dict.items():
	                    tag = TAGS.get(tag_id, tag_id)
	                    if tag == 'GPSInfo' and isinstance(value, dict):
	                        # GPS data is a nested IFD with its own tag names
	                        exif_decoded[tag] = {
	                            GPSTAGS.get(gps_tag_id, gps_tag_id): gps_value
	                            for gps_tag_id, gps_value in value.items()
	                        }
	                    else:
	                        exif_decoded[tag] = value

	                all_metadata['exif_decoded'] = exif_decoded
	        except Exception as e:
	            all_metadata['exif_error'] = str(e)

	        # 2. Extract all image.info metadata (includes IPTC, XMP, ICC, etc.)
	        if info:
	            all_metadata['info'] = {
	                key: self._describe_info_value(value)
	                for key, value in info.items()
	            }

	        # 3. Extract ICC Profile (color management)
	        try:
	            if 'icc_profile' in info:
	                icc_profile = info['icc_profile']
	                all_metadata['icc_profile_size'] = len(icc_profile) if icc_profile else 0
	                all_metadata['icc_profile_present'] = bool(icc_profile)
	        except Exception as e:
	            all_metadata['icc_error'] = str(e)

	        # 4. Extract XMP metadata
	        try:
	            if 'xmp' in info:
	                xmp_data = info['xmp']
	                if isinstance(xmp_data, bytes):
	                    all_metadata['xmp'] = xmp_data.decode('utf-8', errors='ignore')
	                else:
	                    all_metadata['xmp'] = xmp_data
	        except Exception as e:
	            all_metadata['xmp_error'] = str(e)

	        # 5. Extract IPTC metadata
	        try:
	            iptc_data = {
	                k: v for k, v in info.items()
	                if isinstance(k, str) and k.startswith('iptc')
	            }
	            if iptc_data:
	                all_metadata['iptc'] = iptc_data
	        except Exception as e:
	            all_metadata['iptc_error'] = str(e)

	        # 6. Extract PIL tag and tag_v2 (alternative EXIF access)
	        try:
	            if hasattr(image, 'tag') and image.tag:
	                all_metadata['pil_tag'] = dict(image.tag)
	        except Exception as e:
	            all_metadata['pil_tag_error'] = str(e)

	        try:
	            if hasattr(image, 'tag_v2') and image.tag_v2:
	                all_metadata['pil_tag_v2'] = dict(image.tag_v2)
	        except Exception as e:
	            all_metadata['pil_tag_v2_error'] = str(e)

	        # 7. Extract quantization tables (JPEG compression info)
	        try:
	            if hasattr(image, 'quantization') and image.quantization:
	                all_metadata['quantization_tables'] = len(image.quantization)
	        except Exception as e:
	            all_metadata['quantization_error'] = str(e)

	        # 8. Extract embedded thumbnails
	        if 'thumbnail' in info:
	            all_metadata['thumbnail_present'] = True

	        # 9. Extract text chunks (PNG specific)
	        try:
	            if image.format == 'PNG' and hasattr(image, 'text'):
	                all_metadata['png_text_chunks'] = dict(image.text)
	        except Exception as e:
	            all_metadata['png_text_error'] = str(e)

	        # 10. Extract basic image properties that might contain metadata
	        all_metadata['basic_properties'] = {
	            'format': getattr(image, 'format', None),
	            'mode': getattr(image, 'mode', None),
	            'size': getattr(image, 'size', None),
	            'filename': getattr(image, 'filename', None),
	            'format_description': getattr(image, 'format_description', None)
	        }

	        return all_metadata

	    def _get_quality_settings_for_format(self, format_name: str, original_mode: str) -> Dict:
	        """
	        Get optimal save settings for maximum quality preservation per format.
	        The settings explicitly blank out EXIF / ICC / PNG info where the
	        format's writer accepts such options.

	        Args:
	            format_name: PIL format name
	            original_mode: Original image color mode (currently unused)

	        Returns:
	            A new dictionary of save parameters; empty for formats without
	            specific settings (e.g. BMP, GIF, or unknown formats)
	        """
	        if format_name == 'JPEG':
	            return {
	                'quality': 100,        # Maximum quality
	                'optimize': False,     # Don't optimize to preserve quality
	                'progressive': False,  # Standard baseline JPEG
	                'subsampling': 0,      # No chroma subsampling
	                'exif': b'',           # Explicitly remove EXIF
	                'icc_profile': None    # Remove ICC profile
	            }

	        if format_name == 'PNG':
	            return {
	                'optimize': False,     # Don't optimize to preserve quality
	                'compress_level': 1,   # Minimal compression
	                'icc_profile': None,   # Remove ICC profile
	                'pnginfo': None        # Remove PNG info chunks
	            }

	        if format_name == 'WEBP':
	            return {
	                'lossless': True,      # Lossless compression
	                'quality': 100,        # Maximum quality
	                'method': 6,           # Best compression method
	                'icc_profile': None,   # Remove ICC profile
	                'exif': b''            # Remove EXIF
	            }

	        if format_name == 'TIFF':
	            return {
	                'compression': None,   # No compression
	                'icc_profile': None    # Remove ICC profile
	            }

	        # BMP, GIF and anything else: these formats have limited metadata anyway
	        return {}

	    def _clean_png_with_transparency(self, image: Image.Image, output_path: str) -> bool:
	        """
	        Clean PNG image while preserving transparency and quality, removing ALL metadata.

	        Works for alpha modes ('RGBA', 'LA') and for palette images, whose
	        transparency lives in ``image.info['transparency']`` and is carried over.

	        Args:
	            image: PIL Image object
	            output_path: Path for output file

	        Returns:
	            True if successful

	        Raises:
	            Exception: If the image cannot be rebuilt or saved
	        """
	        try:
	            original_mode = image.mode

	            # Create completely new clean image preserving only visual data
	            clean_img = self._pixel_only_copy(image)

	            # Save with quality preservation and NO metadata
	            save_settings = self._get_quality_settings_for_format('PNG', original_mode)
	            clean_img.save(output_path, format='PNG', **save_settings)

	            return True

	        except Exception as e:
	            raise Exception(f"PNG transparency cleaning failed: {e}") from e

	    def _clean_jpeg_image(self, image: Image.Image, output_path: str) -> bool:
	        """
	        Clean JPEG image while preserving maximum quality, removing ALL metadata.

	        Non-RGB inputs are converted to RGB before saving. The image is
	        re-encoded, so the result is visually close but not bit-identical.

	        Args:
	            image: PIL Image object
	            output_path: Path for output file

	        Returns:
	            True if successful

	        Raises:
	            Exception: If the image cannot be converted or saved
	        """
	        try:
	            if image.mode != 'RGB':
	                clean_img = image.convert('RGB')
	            else:
	                # Create new image to ensure no metadata transfer
	                clean_img = Image.new('RGB', image.size)
	                clean_img.paste(image, (0, 0))

	            # Ensure no metadata is carried over
	            # NOTE(review): this also drops the EXIF Orientation tag, so photos
	            # that relied on it may display rotated - confirm this is acceptable.
	            clean_img.info = {}

	            # Save with maximum quality and absolutely NO metadata
	            save_settings = self._get_quality_settings_for_format('JPEG', image.mode)
	            clean_img.save(output_path, format='JPEG', **save_settings)

	            return True

	        except Exception as e:
	            raise Exception(f"JPEG cleaning failed: {e}") from e

	    def _clean_other_format(self, image: Image.Image, output_path: str, original_format: str) -> bool:
	        """
	        Clean other image formats while preserving quality, removing ALL metadata.

	        Only the currently loaded frame is copied.

	        Args:
	            image: PIL Image object
	            output_path: Path for output file
	            original_format: Original image format

	        Returns:
	            True if successful

	        Raises:
	            Exception: If the image cannot be rebuilt or saved
	        """
	        try:
	            # Create completely new image without any metadata transfer
	            clean_img = self._pixel_only_copy(image)

	            # Save with format-specific quality settings and no metadata
	            save_settings = self._get_quality_settings_for_format(original_format, image.mode)
	            clean_img.save(output_path, format=original_format, **save_settings)

	            return True

	        except Exception as e:
	            raise Exception(f"{original_format} cleaning failed: {e}") from e

	    def _verify_metadata_removal(self, output_path) -> bool:
	        """
	        Comprehensive verification that ALL metadata has been removed.

	        Re-opens the written file and re-runs the metadata extraction.
	        Encoding parameters that every file of a format carries (JPEG
	        quantization tables, JFIF header fields, dpi, transparency, ...) are
	        not counted as leftover metadata.

	        Args:
	            output_path: Path of the cleaned file to inspect

	        Returns:
	            True if ALL metadata was successfully removed

	        Raises:
	            Exception: If the file cannot be opened or inspected
	        """
	        try:
	            with Image.open(output_path) as img:
	                remaining_metadata = self._extract_all_metadata(img)

	            # NOTE(review): TIFF files expose their structural tags through
	            # tag/tag_v2, so TIFF output is presumably always reported here -
	            # TODO confirm and filter if needed.
	            sensitive_keys = ['exif_raw', 'exif_decoded', 'icc_profile_present',
	                              'xmp', 'iptc', 'pil_tag', 'pil_tag_v2',
	                              'thumbnail_present', 'png_text_chunks']

	            remaining_sensitive = {k: remaining_metadata[k] for k in sensitive_keys
	                                   if remaining_metadata.get(k)}

	            # Only info entries that are not plain encoding parameters count
	            leftover_info = {k: v for k, v in remaining_metadata.get('info', {}).items()
	                             if k not in self._STRUCTURAL_INFO_KEYS}
	            if leftover_info:
	                remaining_sensitive['info'] = leftover_info

	            if remaining_sensitive:
	                logger_config.warning(f"WARNING: Sensitive metadata still present: {list(remaining_sensitive.keys())}")
	                logger_config.debug(f"Remaining metadata: {remaining_sensitive}")
	                return False

	            logger_config.success("ALL metadata successfully removed - image is clean")
	            return True

	        except Exception as e:
	            raise Exception(f"Verification failed: {e}") from e

	    def _verify_image_quality(self, original_path: str, cleaned_path: str) -> bool:
	        """
	        Verify that the cleaned file exists, is non-empty and kept its dimensions.

	        Args:
	            original_path: Path to original image
	            cleaned_path: Path to cleaned image

	        Returns:
	            True if the checks pass, False if the cleaned file does not exist

	        Raises:
	            Exception: If the cleaned file is empty, dimensions differ, or
	                either file cannot be opened
	        """
	        try:
	            # Check the file on disk first; opening a missing file would raise
	            # before the existence check could ever run.
	            if not os.path.exists(cleaned_path):
	                return False

	            cleaned_size = os.path.getsize(cleaned_path)
	            if cleaned_size == 0:
	                raise Exception("Cleaned file is empty")

	            with Image.open(original_path) as original, Image.open(cleaned_path) as cleaned:
	                if original.size != cleaned.size:
	                    raise Exception(f"Dimension mismatch! Original: {original.size}, Cleaned: {cleaned.size}")

	                logger_config.success(f"Quality preserved - Size: {cleaned.size}, File size: {cleaned_size:,} bytes")
	            return True

	        except Exception as e:
	            raise Exception(f"Quality verification failed: {e}") from e

	    def process(self, input_file_name: str):
	        """
	        Remove ALL metadata from an image for complete security and privacy protection.

	        This method removes:
	        - EXIF data (camera settings, GPS coordinates, timestamps)
	        - IPTC data (keywords, captions, copyright)
	        - XMP data (Adobe metadata)
	        - ICC color profiles
	        - Embedded thumbnails
	        - PNG text chunks
	        - Any other embedded metadata

	        Encoding parameters required to decode the file (for example JPEG
	        quantization tables) necessarily remain in the output.

	        Args:
	            input_file_name: Input image name, resolved relative to ``self.input_dir``

	        Returns:
	            Tuple containing (output_path, extracted_metadata). If any step
	            fails, the error is logged and ``(None, {})`` is returned instead;
	            no exception propagates to the caller.
	        """
	        try:
	            self.input_file_name = input_file_name
	            self.input_file_path = f'{self.input_dir}/{self.input_file_name}'
	            # Validate input
	            self._validate_input_file()

	            # Generate output path
	            output_path = self._generate_output_path()

	            logger_config.info(f"Processing: {self.input_file_path}")

	            # Open and analyze image
	            with Image.open(self.input_file_path) as image:
	                original_format = image.format
	                original_mode = image.mode

	                # Extract ALL metadata for security analysis
	                extracted_metadata = self._extract_all_metadata(image)

	                # Print comprehensive metadata analysis
	                logger_config.info("=== COMPREHENSIVE METADATA ANALYSIS ===")
	                for key, value in extracted_metadata.items():
	                    if value:  # Only show non-empty metadata
	                        logger_config.info(f"{key}: {value}")
	                # 'basic_properties' is always populated, so it must be ignored
	                # when deciding whether the image carried any metadata at all.
	                if not any(value for key, value in extracted_metadata.items()
	                           if key != 'basic_properties'):
	                    logger_config.info("No metadata found in original image")
	                logger_config.info("=== END METADATA ANALYSIS ===")

	                # Choose appropriate cleaning method based on format
	                if original_format == 'PNG' and original_mode in ('RGBA', 'LA', 'P'):
	                    success = self._clean_png_with_transparency(image, output_path)
	                elif original_format == 'JPEG' or output_path.lower().endswith(('.jpg', '.jpeg')):
	                    success = self._clean_jpeg_image(image, output_path)
	                else:
	                    success = self._clean_other_format(image, output_path, original_format)

	            if not success:
	                raise Exception("Metadata cleaning failed")

	            # Verify results
	            if not self._verify_image_quality(self.input_file_path, output_path):
	                raise Exception(f"Cleaned file was not created: {output_path}")

	            # Comprehensive metadata removal verification
	            metadata_removed = self._verify_metadata_removal(output_path)

	            logger_config.success(f"Cleaned image saved: {output_path}")
	            if metadata_removed:
	                logger_config.success("ALL METADATA REMOVED - Image is secure for sharing")
	            else:
	                # Do not claim the file is safe when verification found leftovers
	                logger_config.warning("Cleaned image may still contain metadata - review before sharing")
	            return output_path, extracted_metadata

	        except Exception as e:
	            logger_config.error(f"Failed to clean image: {str(e)}")
	            return None, {}

	# Example usage
	if __name__ == "__main__":
	    cleaner = RemoveMetadata()
	    # NOTE(review): process() prefixes the name with cleaner.input_dir, so this
	    # argument is presumably relative to that directory - verify the path.
	    cleaned_file, metadata = cleaner.process("image/input/test.png")
	    print(f"Output: {cleaned_file}")
	    print(f"Extracted metadata: {metadata}")
	    # process() returns (None, {}) on failure; only claim success when a file was written
	    if cleaned_file:
	        print("Image is now secure for sharing - all metadata removed!")
	    else:
	        print("Metadata removal failed - see the log for details.")