#!/usr/bin/env python
#
# Aufgabe 2: Auffinden von identischen Dateien
# Python-Kurs SS 2003: http://www.inf.tu-dresden.de/python-kurs/
# Josef Spillner <js177634@inf.tu-dresden.de>
# Module einbinden
import sys
import os
# Module-level state shared by the scan, compare and report phases.
filesizes = dict()   # file size -> list of paths that have this size
revsizes = dict()    # path -> file size
filehashes = dict()  # MD5 digest -> list of paths with this digest
revhashes = dict()   # file size -> list of MD5 digests recorded for it
inodes = dict()      # "inode:device" string -> 1 for each file already seen
verbose = 0          # progress messages are printed only when this equals 1
# Helper: collects information about all relevant files.
def pathcallback(arg, dirname, fnames):
    """Register the regular files of one directory, grouped by size.

    Used as the visitor callback for os.path.walk.

    arg     -- extra value handed through by os.path.walk; not used here.
    dirname -- directory currently being visited.
    fnames  -- names of the entries inside dirname.

    Symbolic links and anything that is not a regular file are ignored.
    A file whose inode/device pair was already recorded is skipped, so
    hard links to the same data are only counted once.  Results are
    stored in the module-level dicts filesizes, revsizes and inodes.
    """
    if verbose == 1:
        print("%s %s" % ("* durchsuche ", dirname))
    for entry in fnames:
        canonicalname = dirname + "/" + entry
        if not os.path.isfile(canonicalname) or os.path.islink(canonicalname):
            continue
        info = os.stat(canonicalname)
        # Stat tuple layout: index 6 is the size, 1 the inode, 2 the device.
        size = info[6]
        key = ":".join((str(info[1]), str(info[2])))
        if key in inodes:
            continue
        inodes[key] = 1
        if verbose == 1:
            print("%s %s" % ("* prüfe ", canonicalname))
        filesizes.setdefault(size, []).append(canonicalname)
        revsizes[canonicalname] = size
# Walk every directory given on the command line; when the script is
# started without arguments, walk the current directory instead.
roots = sys.argv[1:]
if len(sys.argv) == 1:
    roots = ["."]
for root in roots:
    os.path.walk(root, pathcallback, [])
# Compare all files that share the same size.
#
# Every file whose size occurs more than once is hashed with the external
# "md5sum" program.  filehashes maps each digest to the files that have
# it, and revhashes maps each size to the distinct digests found for it.
if verbose == 1:
    print("* vergleiche...")
for size in filesizes:
    candidates = filesizes[size]
    # A size that occurs only once cannot belong to a duplicate.
    if len(candidates) < 2:
        continue
    for file in candidates:
        # Wrap the path in single quotes for the shell; an embedded single
        # quote is closed, emitted inside double quotes, and reopened.
        path = "'%s'" % file.replace("'", "'\"'\"'")
        pipe = os.popen("md5sum " + path + " 2>/dev/null")
        try:
            result = pipe.readline()
        finally:
            # Always release the pipe so long scans do not leak descriptors.
            pipe.close()
        # No output means md5sum could not read the file; skip it.
        if result == "":
            continue
        # md5sum prefixes the line with a backslash when it had to escape
        # the file name; strip it so the digest stays comparable.
        md5 = result.split(" ")[0].lstrip("\\")
        if md5 not in filehashes:
            filehashes[md5] = []
        filehashes[md5].append(file)
        # revhashes is keyed by size (the original tested the digest as
        # key, which reset the list for every file).
        if size not in revhashes:
            revhashes[size] = []
        # Record each digest once per size so a group is reported once.
        if md5 not in revhashes[size]:
            revhashes[size].append(md5)
# Print the results: sizes in ascending order, and for every digest that
# is shared by more than one file, the size followed by the file names.
if verbose == 1:
    print("* fertig")
sizelist = sorted(revhashes)
for size in sizelist:
    for digest in revhashes[size]:
        group = filehashes[digest]
        if len(group) <= 1:
            continue
        # Heading line: the recorded size of the first file in the group.
        print(str(revsizes[group[0]]) + ":")
        for member in group:
            print("\t" + member)
# syntax highlighted by Code2HTML, v. 0.9.1