#!/usr/bin/env python
#
# Aufgabe 2: Auffinden von identischen Dateien
# Python-Kurs SS 2003: http://www.inf.tu-dresden.de/python-kurs/
# Josef Spillner <js177634@inf.tu-dresden.de>

# Module einbinden
import sys
import os

# Initialisation of the global bookkeeping tables
inodes = {}      # "<stat field 1>:<stat field 2>" key -> 1, marks files already visited
filesizes = {}   # file size -> list of paths having that size
revsizes = {}    # path -> file size (reverse lookup of filesizes)
filehashes = {}  # MD5 digest -> list of paths having that digest
revhashes = {}   # file size -> list of MD5 digests recorded for that size
verbose = 0      # progress messages are printed when this is 1

# Hilfsfunktion: holt Informationen über alle relevanten Dateien
def pathcallback(arg, dirname, fnames):
	if verbose == 1:
		print "* durchsuche ", dirname
	for file in fnames:
		canonicalname = dirname + "/" + file
		if os.path.isfile(canonicalname):
			if not os.path.islink(canonicalname):
				ret = os.stat(canonicalname)
				size = ret[6]
				key = str(ret[1]) + ":" + str(ret[2])
				if not inodes.has_key(key):
					inodes[key] = 1
					if verbose == 1:
						print "* prüfe ", canonicalname
					if not filesizes.has_key(size):
						filesizes[size] = []
					filesizes[size].append(canonicalname)
					revsizes[canonicalname] = size

# Walk every directory named on the command line; without arguments the
# current directory is searched instead.
if len(sys.argv) == 1:
	roots = ["."]
else:
	roots = sys.argv[1:]
for root in roots:
	os.path.walk(root, pathcallback, [])

# Vergleichen aller Dateien mit derselben Größe
if verbose == 1:
	print "* vergleiche..."
for size in filesizes.keys():
	if len(filesizes[size]) > 1:
		for file in filesizes[size]:
			path = "'%s'" % file.replace("'", "'\"'\"'")
			ret = os.popen("md5sum " + path + " 2>/dev/null")
			result = ret.readline()
			if result != "":
				md5 = result.split(" ")[0]
				if not filehashes.has_key(md5):
					filehashes[md5] = []
				filehashes[md5].append(file)
				if not revhashes.has_key(md5):
					revhashes[size] = []
				revhashes[size].append(md5)

# Ausgabe der Ergebnisse
if verbose == 1:
	print "* fertig"
sizelist = revhashes.keys()
sizelist.sort()
for size in sizelist:
	for md5 in revhashes[size]:
		if len(filehashes[md5]) > 1:
			print str(revsizes[filehashes[md5][0]]) + ":"
			for file in filehashes[md5]:
				print "\t", file



# syntax highlighted by Code2HTML, v. 0.9.1