#!/usr/bin/env python2.2
#
# Aufgabe 3: Rekursives Laden von Dokumenten über HTTP
# Python-Kurs SS 2003: http://www.inf.tu-dresden.de/python-kurs/
# Josef Spillner <js177634@inf.tu-dresden.de>
# Import der Standardmodule
import urllib
import urlparse
import re
import os
import sys
# Nutzungshinweise
from usage import *
# Options; some of them can be changed on the command line.

# List of URLs to fetch
urls = []
# Local directory that downloads are stored below
opt_dir = "."

# Switches: show the help screen / only dump the URLs instead of following them
opt_help = 0
opt_dump = 0

# Limit of recursive retrievals
intopt_depth = 1
# Verbosity (non-zero prints progress messages)
intopt_verbose = 0
# Use a previously saved local copy if possible
intopt_cache = 0

# Add /index.html if the file name does not look like *.*
file_hack = 1
# Matches src=/href= attributes; the link target is captured in group "url".
_LINK_PATTERN = re.compile(
    r'(src|href)\s*\="?(?P<url>[\w\.\/\?\&\=\:\-\#\,\;\~]+)"?')


def _mkdir_quiet(path):
    """Create the directory *path*, ignoring a failure to do so."""
    try:
        os.mkdir(path)
    except OSError:
        # Usually the directory already exists.  A real problem surfaces
        # later, when the downloaded file is opened for writing.
        pass


def _read_all(stream):
    """Read *stream* to its end and close it, even if reading fails."""
    try:
        return stream.read()
    finally:
        stream.close()


def _canonical_link(suburl, url, components, pathname):
    """Bring a link found in a document into canonical (absolute) form.

    suburl     -- the link as written in the document (never empty)
    url        -- URL of the document that contains the link
    components -- the document URL split at "/"; for scheme://host/...
                  components[0] is the scheme part and components[2] the host
    pathname   -- base URL of the document's directory, ending in "/"

    Returns the absolute URL with any "#fragment" removed, or "" if the
    link must not be followed (mailto: links).
    """
    if re.match(r"^http[s]?\:\/\/.*", suburl):
        newurl = suburl
    elif re.match(r"^mailto\:.*", suburl):
        # discard
        newurl = ""
    elif suburl[0] == "#":
        # A link into the same document resolves to the document itself
        newurl = url
    elif suburl[0] == "/":
        newurl = components[0] + "//" + components[2] + suburl
    elif suburl[0] == ".":
        suburl = re.sub(r"^.\/", "", suburl)
        suburl = re.sub(r"\/\.\/", "/", suburl)
        newurl = pathname + suburl
        # Collapse "dir/../" pairs one at a time
        while re.search(r"\/\.\.\/", newurl):
            collapsed = re.sub(r"\/[^\/]+\/\.\.\/", "/", newurl, count=1)
            if collapsed == newurl:
                # More ".." segments than directories are left: nothing
                # can be collapsed any more, so stop instead of spinning
                # forever.
                break
            newurl = collapsed
    else:
        newurl = pathname + suburl
    return re.sub(r"\#.*$", "", newurl)


def run(urls, dir, dump, depth):
    """Download every URL in *urls* below *dir* and follow the links found.

    urls  -- list of URL strings to fetch
    dir   -- local base directory; the host and path of each URL are
             recreated as sub-directories below it
    dump  -- if true, the links found are only printed, not fetched
    depth -- remaining recursion depth; documents are only scanned for
             links while depth > 0

    Each document is stored under its own file name; if that name is taken,
    ".1", ".2", ... is appended.  The module-level settings intopt_verbose,
    intopt_cache and file_hack control progress messages, re-use of a
    previously saved copy and the "/index.html" file name repair.

    Returns None.  A URL that cannot be fetched is reported on standard
    output and skipped.
    """
    fetchlist = {}

    # Evaluate every URL on its own
    for url in urls:
        newurl = url
        if depth > 0 and intopt_verbose:
            print("Retrieving %s" % newurl)

        # Repair incomplete URLs
        if newurl.endswith("/"):
            newurl += "index.html"
        components = newurl.split("/")
        if len(components) == 3:
            components.append("index.html")
        path = components[2:-1]
        if not path:
            path = ["/"]
            components.append("/")
            url = url + "/"
        filename = components[-1]
        if file_hack and not re.match(r"\w+\.\w+", filename):
            path.append(filename)
            filename = "/index.html"

        # Create the local directory recursively
        localfile = dir
        _mkdir_quiet(localfile)
        for pathdir in path:
            localfile += "/" + pathdir
            _mkdir_quiet(localfile)
        localfile += "/" + filename

        # Find a new file name if this one is already taken
        origfile = localfile
        number = 1
        while os.path.isfile(localfile):
            localfile = origfile + "." + str(number)
            number = number + 1

        # Build the generic base path from the path list
        pathname = components[0] + "//" + "".join([p + "/" for p in path])

        # Try to get the document, from the cache if allowed.
        # Binary mode keeps non-text content (e.g. images referenced by
        # src=) byte-exact on every platform.
        ok = 0
        if intopt_cache:
            try:
                contents = _read_all(open(origfile, "rb"))
            except IOError:
                if intopt_verbose:
                    print("Not found in cache: %s" % origfile)
            else:
                ok = 1
                if intopt_verbose:
                    print("Found in cache: %s" % origfile)
        if not ok:
            try:
                contents = _read_all(urllib.urlopen(url))
            except Exception:
                # Best effort: any failure to fetch is reported and the URL
                # is skipped, whatever the network layer raised.
                print("Error fetching %s" % url)
            else:
                ok = 1

        if ok:
            if intopt_verbose:
                print("Save to %s" % localfile)
            local = open(localfile, "wb")
            try:
                local.write(contents)
            finally:
                local.close()

            # Scan the document for links, if requested
            if depth > 0:
                for match in _LINK_PATTERN.finditer(contents):
                    newurl = _canonical_link(match.group("url"), url,
                                             components, pathname)
                    if newurl != "":
                        fetchlist[newurl] = 1

    # Fetch the subordinate URLs until the search depth is reached
    keylist = list(fetchlist.keys())
    keylist.sort()
    for suburl in keylist:
        if dump:
            print(suburl)
        else:
            print("Recursive lookup: %s" % suburl)
            run([suburl], dir, dump, depth - 1)
# Read the command line options.
# The program name (sys.argv[0]) is skipped by position, and the value of
# -b/--basedir is consumed together with its option so that it is never
# interpreted as an option or a URL itself.
idx = 1
while idx < len(sys.argv):
    arg = sys.argv[idx]
    if arg == "-h" or arg == "--help":
        opt_help = 1
    elif arg == "-d" or arg == "--dump":
        opt_dump = 1
    elif arg == "-b" or arg == "--basedir":
        if idx + 1 < len(sys.argv):
            opt_dir = sys.argv[idx + 1]
            # Step over the directory argument as well
            idx = idx + 1
        else:
            print("Missing argument!")
    else:
        # Everything else is a URL to fetch
        urls.append(arg)
    idx = idx + 1

# Start the program if the options are valid, otherwise show the usage notes
if len(urls) == 0 or opt_help:
    if len(urls) == 0:
        print("Missing URLs!")
    usage()
else:
    run(urls, opt_dir, opt_dump, intopt_depth)
# syntax highlighted by Code2HTML, v. 0.9.1