Checking whether files are the same (perl)

From HalfgeekKB
Revision as of 23:34, 5 November 2005 by 161.253.47.104 (talk)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

A common way to determine whether two files are identical is, of course, to first compare their file sizes, then, if the same size, compare their message digests.

The following program is one I wrote to process the output of "find ." run in an iTunes music directory, looking for similarly named duplicates. (The duplicates of a given file base.ext would be named base 1.ext, base 2.ext, etc.) The program is implemented as a set of memoized functions: each file's size and digest are computed at most once and then cached.

#! /usr/bin/perl

use warnings;
use strict;

use Digest::SHA1;
# Name of the digest class; file_hash calls ->new on it to create a
# fresh digest context for each file it hashes.
my $digest = 'Digest::SHA1';

# Per-path cache, keyed by the file name exactly as given.  Each value is
# an array reference: slot 0 holds the file size (-1 for a path that did
# not exist when first looked up) and slot 1 holds the base64 digest.
# Slots are filled in lazily by file_size and file_hash.
my %tab = ();

# file_entry FILE
#
# Returns the cache record for FILE (an array reference stored in %tab),
# creating it on the first request.  A freshly created record is empty
# when FILE exists and is [-1] when it does not, so slot 0 (the size)
# already carries the "missing" marker for absent files.
sub file_entry ($) {
	my ($file) = @_;

	if( !exists $tab{$file} ) {
		$tab{$file} = ( -e $file ) ? [] : [-1];
	}

	return $tab{$file};
}

# file_size FILE
#
# Returns the size of FILE as recorded in slot 0 of its cache record.
# The size is looked up with -s only when the slot is still undefined;
# a record created for a missing file already holds -1 there, so that
# value is returned without touching the file system again.
sub file_size ($) {
	my ($file) = @_;
	my $entry = file_entry($file);

	$entry->[0] = -s $file if !defined $entry->[0];

	return $entry->[0];
}

# file_hash FILE
#
# Returns the base64 digest of FILE's contents, computing it at most once
# per path and caching it in slot 1 of the file's cache record.
#
# Returns undef (and caches nothing) when FILE does not exist, cannot be
# opened for reading, or cannot be read to the end (for example because
# it is a directory).
sub file_hash ($) {

	my $file = shift;
	my $e = file_entry($file);

	unless( defined $e->[1] ) {
		# A negative size is the marker for a file that does not exist.
		return undef if file_size($file) < 0;

		# Three-argument open: the name is always treated as a plain path,
		# so characters such as '<', '>' or '|' in it cannot be mistaken
		# for an open mode.
		open(my $fp, '<', $file) or return undef;
		binmode($fp);

		my $ctx = $digest->new;

		# addfile() reads the handle to end-of-file.  add() treats its
		# arguments as literal data, so handing it the handle would digest
		# the handle's name instead of the file contents and make every
		# file hash to the same value.  addfile() croaks on a read error,
		# hence the eval.
		my $ok = do {
			local $@;
			eval { $ctx->addfile($fp); 1 };
		};
		close $fp;
		return undef unless $ok;

		$e->[1] = $ctx->b64digest;
	}

	return $e->[1];
}

# files_equiv FILE1, FILE2
#
# Returns 1 when both files exist, have the same size and have the same
# content digest; returns 0 otherwise.  The cheap size comparison runs
# first so that digests are only computed for same-sized files.
sub files_equiv ($$) {
	# Named $first/$second rather than $a/$b so the package variables
	# used by sort are not shadowed.
	my($first, $second) = @_;

	return 0 if file_size($first) < 0;
	return 0 if file_size($second) < 0;
	return 0 if file_size($first) != file_size($second);

	my $hash_first  = file_hash($first);
	my $hash_second = file_hash($second);

	# file_hash yields undef for a file it could not open.  Two unreadable
	# files must not be reported as duplicates just because both digests
	# are missing.
	return 0 unless defined $hash_first and defined $hash_second;

	return $hash_first eq $hash_second ? 1 : 0;
}

# Paths found to be duplicates of another listed file; the keys are the
# "base N.ext" names, the values are always 1.
my %remove = ();

# Read one path per line from STDIN.  For every path of the form
# base.ext, probe "base 1.ext" through "base 256.ext" and remember each
# one whose contents match the original.
LINE: while( defined( my $file = <STDIN> ) ) {
	chomp $file;
	next LINE if $file eq '.' or $file eq '..';

	# Only names with an alphanumeric extension are considered.
	next LINE unless $file =~ /^(.*?)\.([0-9a-zA-Z]+)$/;
	my($base, $ext) = ($1, $2);

	print STDERR "Processing: $file\n";

	foreach my $i (1 .. 256) {
		my $alt = "$base $i.$ext";
		next unless files_equiv($file,$alt);

		print STDERR "-- Duplicated in $alt\n";
		$remove{$alt} = 1;
	}
}

print STDERR "Generating commands to remove files\n";

# Emit one shell command per duplicate on STDOUT, in sorted order.
# Each name is wrapped in single quotes, with any embedded single quote
# written as '\'' so that the shell takes the whole name literally;
# inside double quotes the shell would still expand $, backticks and
# backslashes found in a file name.  The "--" stops rm from treating a
# name that begins with "-" as an option.
foreach my $path ( sort keys %remove ) {
	( my $quoted = $path ) =~ s/'/'\\''/g;
	print "rm -- '$quoted'\n";
}