#  Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
# 
#  http://www.apache.org/licenses/LICENSE-2.0
# 
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
import numpy as np
import os

def filter_npz_by_filenames(npz_path, txt_path, output_path):
    """Write a copy of an NPZ archive with the listed entries removed.

    The array stored under ``arr_0`` in ``npz_path`` is read, every item
    whose ``'uuid'`` value appears in ``txt_path`` is dropped, and the
    remaining items are saved as the only array of a compressed archive at
    ``output_path``. Each removed uuid and a short summary are printed.

    Args:
        npz_path: Path of the input ``.npz`` archive (must contain ``arr_0``).
        txt_path: Path of a text file with one uuid to exclude per line;
            surrounding whitespace is stripped and blank lines are ignored.
        output_path: Destination path handed to ``np.savez_compressed``.
    """
    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects;
    # only run this on archives from a trusted source.
    # The context manager closes the archive's file handle once the array
    # has been read into memory.
    with np.load(npz_path, allow_pickle=True) as archive:
        data_list = archive['arr_0']

    with open(txt_path, 'r') as f:
        exclude_filenames = set(line.strip() for line in f if line.strip())

    # Mark the items to keep; a boolean mask lets the output reuse the input
    # array's dtype, including when nothing at all is kept.
    total_count = len(data_list)
    keep_mask = np.ones(total_count, dtype=bool)
    for index, item in enumerate(data_list):
        filename = item['uuid']
        if filename in exclude_filenames:
            keep_mask[index] = False
            print(filename)

    filtered_data = data_list[keep_mask]
    kept_count = len(filtered_data)
    excluded_count = total_count - kept_count

    print(f"Original items: {total_count}")
    print(f"Kept items: {kept_count}")
    print(f"Removed items: {excluded_count}")

    # Save the filtered data. No allow_pickle keyword is passed: NumPy
    # versions whose savez_compressed only accepts *args/**kwds would store
    # it as an extra array named "allow_pickle" inside the archive.
    print("Saving filtered data")
    np.savez_compressed(output_path, filtered_data)

def main():
    """Filter both hard-coded NPZ archives against the issue list."""
    # Change this to your text file path
    issue_list = "data_utils/issue_data_list.txt"

    # (input NPZ, output NPZ) pairs -- change these to your NPZ file paths.
    archive_pairs = (
        ("articulation_xlv2_train.npz", "articulation_xlv2_train_update.npz"),
        ("articulation_xlv2_test.npz", "articulation_xlv2_test_update.npz"),
    )

    for source_npz, target_npz in archive_pairs:
        filter_npz_by_filenames(source_npz, issue_list, target_npz)
        
# Run the filtering only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()