| #!/usr/bin/env python3 |
| # Copyright (C) 2024 Apple Inc. All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions |
| # are met: |
| # 1. Redistributions of source code must retain the above copyright |
| # notice, this list of conditions and the following disclaimer. |
| # 2. Redistributions in binary form must reproduce the above copyright |
| # notice, this list of conditions and the following disclaimer in the |
| # documentation and/or other materials provided with the distribution. |
| # |
| # THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS "AS IS" AND |
| # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| # DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR |
| # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| import argparse |
| import errno |
| import hashlib |
| import os |
| import sys |
| |
# Number of bytes read from a file per iteration while hashing it (8 KiB).
CHUNK_SIZE = 8 * 1024
| |
| |
def parse_args(args):
    """Parse the command-line arguments of the PNG de-duplication tool.

    Args:
        args: List of argument strings, without the program name.

    Returns:
        An ``argparse.Namespace`` with ``root`` (str, default ``'.'``),
        ``storage`` (str, default ``'storage'``) and ``dry_run`` (bool,
        default ``False``).
    """
    parser = argparse.ArgumentParser(
        description='Program which de-duplicates PNG files in a directory using hard links'
    )
    parser.add_argument(
        'root', nargs='?',
        help='Path to root of directory to de-duplicate PNG files in',
        type=str, default='.',
    )
    parser.add_argument(
        '-s', '--storage',
        help='Directory to store images the program is linking to',
        type=str,
        default='storage',
    )
    parser.add_argument(
        '-d', '--dry-run',
        help='Print the link and remove operations instead of performing them',
        action='store_true',
        dest='dry_run',
        default=False,
    )
    return parser.parse_args(args)
| |
| |
def sha1(path):
    """Return the hexadecimal SHA-1 digest of the file at ``path``.

    The file is opened in binary mode and consumed ``CHUNK_SIZE`` bytes at a
    time. ``None`` is returned when an ``OSError`` is raised while opening or
    reading the file.
    """
    try:
        with open(path, 'rb') as stream:
            digest = hashlib.sha1()
            # read() yields b'' at end of file, which stops the iteration.
            for block in iter(lambda: stream.read(CHUNK_SIZE), b''):
                digest.update(block)
            return digest.hexdigest()
    except OSError:
        return None
| |
| |
def _prune_storage(storage, dry_run):
    """Remove stored PNGs nothing else links to; return (removed, kept) counts."""
    removed = 0
    kept = 0
    with os.scandir(storage) as existing_files:
        for existing_file in existing_files:
            # Only regular, non-hidden '<name>.png' files are managed here.
            if existing_file.name.startswith('.') or not existing_file.is_file():
                continue
            if '.' not in existing_file.name:
                continue
            _, extension = existing_file.name.split('.', 1)
            if extension != 'png':
                continue
            # Use os.stat() rather than DirEntry.stat(): the cached DirEntry
            # result does not carry a usable st_nlink on every platform.
            connected_to = os.stat(existing_file.path).st_nlink
            if connected_to <= 1:
                # The storage entry is the only remaining name for this data.
                if dry_run:
                    print(f'[dry-run] Removing {existing_file.path}')
                else:
                    os.remove(existing_file.path)
                removed += 1
            else:
                kept += 1
    return removed, kept


def _replace_with_link(sha_path, full_path, temp_path):
    """Replace full_path with a hard link to sha_path, staged via temp_path."""
    try:
        # Link under a temporary name first so full_path is swapped in a
        # single os.replace() call and is never missing.
        os.link(sha_path, temp_path)
        os.replace(temp_path, full_path)
    except OSError as e:
        if e.errno != errno.EMLINK:
            raise
        # sha_path already has the maximum number of links. Leave full_path
        # as it is and re-create the storage entry as a hard link to it, so
        # later duplicates link to this new file instead. In practice, the
        # previous network of hard-linked files will slowly be deleted as old
        # results are pruned.
        # os.replace is not used here because it does not matter if the
        # reference file is deleted.
        os.remove(sha_path)
        os.link(full_path, sha_path)


def main(args):
    """De-duplicate the PNG files below a root directory using hard links.

    Storage entries that no other file links to are removed first. Then every
    '.png' file below the root (outside the storage directory) that has a link
    count of 1 is hashed: if storage already holds '<sha>.png' the file is
    replaced by a hard link to it, otherwise the file is hard-linked into
    storage under that name. With --dry-run the link and remove operations
    are printed instead of performed, and the summary counts the printed
    operations.

    Args:
        args: List of command-line argument strings, without the program name.

    Returns:
        0 on completion, 1 when the root directory or the parent of the
        storage directory does not exist.
    """
    args = parse_args(args)

    args.root = os.path.abspath(args.root)
    args.storage = os.path.join(args.root, args.storage)
    if not os.path.isdir(args.root):
        print(f"'{args.root}' is a path which does not exist\n", file=sys.stderr)
        return 1
    storage_parent = os.path.dirname(args.storage)
    if not os.path.isdir(storage_parent):
        print(f"'{storage_parent}' is a path which does not exist\n", file=sys.stderr)
        return 1

    if not os.path.isdir(args.storage):
        os.mkdir(args.storage)

    de_duped = 0
    processed = 0
    cleaned_up, link_count = _prune_storage(args.storage, args.dry_run)

    for root, _, files in os.walk(args.root, topdown=False):
        if os.path.samefile(root, args.storage):
            continue

        for file in files:
            if not file.endswith('.png'):
                continue
            processed += 1
            full_path = os.path.join(root, file)
            # More than one link means the file is already shared.
            if os.stat(full_path).st_nlink > 1:
                continue
            sha_value = sha1(full_path)
            if not sha_value:
                # Report the failure before a storage path is derived from it.
                print(f"Failed to compute SHA for '{full_path}'\n", file=sys.stderr)
                continue
            sha_path = os.path.join(args.storage, f'{sha_value}.png')
            assert sha_path != full_path
            if os.path.isfile(sha_path):
                if args.dry_run:
                    print(f'[dry-run] Linking {full_path} to {sha_value}.png')
                else:
                    tempfile = os.path.join(root, f'{sha_value}.png')
                    _replace_with_link(sha_path, full_path, tempfile)
                de_duped += 1
            else:
                if args.dry_run:
                    print(f'[dry-run] Storing and linking {full_path} to {sha_value}.png')
                else:
                    os.link(full_path, sha_path)
                link_count += 1

    print(f'{link_count} unique png files in storage of {processed} processed')
    print(f'{cleaned_up} files removed, {de_duped} de-duped')

    return 0
| |
# When run as a script, hand the command-line arguments to main() and use its
# return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)