#!/usr/bin/env python2.2
#
# Aufgabe 3: Rekursives Laden von Dokumenten über HTTP
# Python-Kurs SS 2003: http://www.inf.tu-dresden.de/python-kurs/
# Josef Spillner <js177634@inf.tu-dresden.de>

# Import der Standardmodule
import urllib
import urlparse
import re
import os
import sys

# Nutzungshinweise
from usage import *

# Options, some of them changeable via the command line
opt_help = 0		# Display help screen
opt_dump = 0		# Dump URL contents only
opt_dir = "."		# Local directory
urls = []		# List of URLs to fetch
file_hack = 1		# Add /index.html if file doesn't match *.*
intopt_depth = 1	# Limit of recursive retrievals
intopt_verbose = 0	# Verbosity
intopt_cache = 0	# Use cache if possible

# Download einer URL in ein bestimmtes Verzeichnis
def run(urls, dir, dump, depth):
	fetchlist = {}

	# Jede URL einzeln auswerten
	for url in urls:
		newurl = url
		if depth > 0 and intopt_verbose:
			print "Retrieving", newurl

		# Unvollständige URLs reparieren
		if newurl[-1] == "/":
			newurl += "index.html"
		components = newurl.split("/")
		if len(components) == 3:
			components.append("index.html")
		path = components[2:-1]
		if len(path) == 0:
			path = ["/"]
			components.append("/")
			url = url + "/"
		filename = components[-1]
		if file_hack and not re.match("\w+\.\w+", filename):
			path.append(filename)
			filename = "/index.html"

		# Rekursiv lokales Verzeichnis erstellen
		localfile = dir
		try:
			os.mkdir(localfile)
		except:
			pass
		for pathname in path:
			localfile += "/" + pathname
			try:
				os.mkdir(localfile)
			except:
				pass
		localfile += "/" + filename

		# Neuen Dateinamen finden, falls schon belegt
		origfile = localfile
		number = 1
		while(os.path.isfile(localfile)):
			localfile = origfile + "." + str(number)
			number = number + 1

		# Generischen Pfad aus Pfadliste zusammensetzen
		pathname = components[0] + "//"
		for pathdir in path:
			pathname = pathname + pathdir + "/"

		# Versuch eines Downloads, eventuell Cache nutzen
		ok = 0
		if intopt_cache:
			try:
				file = open(origfile, "r")
				contents = file.read()
				ok = 1
				if intopt_verbose:
					print "Found in cache:", origfile
			except:
				if intopt_verbose:
					print "Not found in cache:", origfile
		if ok == 0:
			try:
				file = urllib.urlopen(url)
				contents = file.read()
				ok = 1
			except:
				print "Error fetching", url

		if ok:
			if intopt_verbose:
				print "Save to", localfile
			local = open(localfile, "w")
			local.writelines(contents)

			# Dokument auswerten, falls gewünscht
			if depth > 0:
				longline = ""
				for line in contents:
					longline += line

				while 1:
					# URLs in eine kanonische (absolute) Form bringen
					matches = re.search("(src|href)\s*\=\"?(?P<url>[\w\.\/\?\&\=\:\-\#\,\;\~]+)\"?", longline)
					if matches and matches.group("url"):
						suburl = matches.group("url")
						if re.match("^http[s]?\:\/\/.*", suburl):
							newurl = suburl
						elif re.match("^mailto\:.*", suburl):
							# discard
							newurl = ""
						elif suburl[0] == "#":
							newurl = url
						elif suburl[0] == "/":
							newurl = components[0] + "//" + components[2] + suburl
						elif suburl[0] == ".":
							suburl = re.sub("^.\/", "", suburl)
							suburl = re.sub("\/\.\/", "/", suburl)
							newurl = pathname + suburl
							while 1:
								up = re.search("\/\.\.\/", newurl)
								if up:
									newurl = re.sub("\/[^\/]+\/\.\.\/", "/", newurl, 1);
								else:
									break
						else:
							newurl = pathname + suburl
						newurl = re.sub("\#.*$", "", newurl)
						if newurl != "":
							fetchlist[newurl] = 1
					else:
						break
					longline = re.sub("(src|href)\s*\=\"?([\w\.\/\?\&\=\:\-\#\,\;\~]+)\"?", "", longline, 1)

				# Untergeordnete URLs holen, bis die Suchtiefe erreicht ist
				keylist = fetchlist.keys()
				keylist.sort()
				for suburl in keylist:
					if dump:
						print suburl
					else:
						print "Recursive lookup:", suburl
						run([suburl], dir, dump, depth - 1)

# Einlesen der Kommandozeilenoptionen
idx = 0
skip = 0
for arg in sys.argv:
	if arg == "-h" or arg == "--help":
		opt_help = 1
	elif arg == "-d" or arg == "--dump":
		opt_dump = 1
	elif arg == "-b" or arg == "--basedir":
		if len(sys.argv) > idx + 1:
			opt_dir = sys.argv[idx + 1]
			skip = 1
		else :
			print "Missing argument!"
	elif arg == sys.argv[0]:
		# progname
		pass
	else:
		if skip == 0:
			urls += [arg]
		else:
			skip = 0
	idx = idx + 1

# Falls Optionen korrekt sind, Programm starten
if len(urls) == 0 or opt_help:
	if len(urls) == 0:
		print "Missing URLs!"
	usage()
else:
	run(urls, opt_dir, opt_dump, intopt_depth)



# syntax highlighted by Code2HTML, v. 0.9.1