#!/usr/bin/env python
#
# usage: get.py anyfile
#    or: get.py http://anything.anywhere.com/anyfile.html
#
# Scans a local file or a web page for a set of regular-expression
# patterns and prints every match, numbered, grouped by pattern name.
#
# We need regular expressions, the ability to access files via http,
# and access to command line arguments:
import re
import sys
from contextlib import closing

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

# Dictionaries (= perl's hashes) of patterns to match and to exclude,
# both keyed by a human-readable pattern name.
pats = {}     # name -> regular expression to look for
notpats = {}  # name -> regular expression that rejects a match

# Following are the regular expressions to match.
# Note the r"""...: this means it is a raw string, which means nothing
# special is done with the backslashes \; that specialness is preserved
# for the pattern matching operation.
# Note that the part of the pattern between the () is what is printed out.
# See "4.2 re" at http://python.org/doc/current/lib/lib.html:

pats['http'] = r"""http://(.*?)[\"\s\']"""

pats['relative url'] = r"""href=\s*[\"\'](.*?)[\"\']"""
notpats['relative url'] = r"""http://|mailto:"""

pats['email address'] = r"""([\w\.]+\@[\w\.]+)"""

# your task: comment out the above and make the following functional:

# find url of frames (check at www.ou.edu):
#pats['frame']=

# find "Bobby" noncompliant images, i.e. images without alt tag
# check at:
#   http://soonersports.ocsn.com
#   http://it.metr.ou.edu/rgraphics/radar
#   http://weather.ou.edu, etc.
#pats['bobby violation']=
#notpats['bobby violation']=

# find downloadable pdf documents.
# check at http://www.ou.edu/provost/pronew/
#pats['pdf']=


def is_url(filename):
    """Return True when *filename* should be fetched over HTTP(S).

    Only the start of the argument is inspected, so a local path that
    merely contains "http://" somewhere inside it is still read from disk.
    """
    return filename.startswith(("http://", "https://"))


def _to_text(data):
    """Return *data* as the native ``str`` type of the running interpreter.

    On Python 2 the raw byte string is already a ``str`` and is returned
    untouched.  On Python 3 the bytes are decoded as UTF-8, with
    undecodable bytes replaced rather than raising, so the str regular
    expressions in ``pats`` can be applied to the result.
    """
    if isinstance(data, str):
        return data
    return data.decode("utf-8", "replace")


def read_content(filename):
    """Read the whole document named by *filename* into a string.

    Announces on stdout which source is being used, then either fetches
    the URL or reads the local file.  Both the HTTP response and the file
    handle are closed before returning.  I/O errors (missing file,
    unreachable host) propagate to the caller.
    """
    if is_url(filename):
        sys.stdout.write("connecting to:  %s\n" % filename)
        # closing() is needed because the Python 2 response object is not
        # a context manager.
        with closing(urlopen(filename)) as response:
            data = response.read()
    else:
        sys.stdout.write("reading local file:  %s\n" % filename)
        # Binary mode: same raw bytes as the HTTP branch delivers, and no
        # locale-dependent decoding errors on Python 3.
        with open(filename, "rb") as infile:
            data = infile.read()
    return _to_text(data)


def find_matches(content, pattern, exclude=None):
    """Return the matches of *pattern* in *content*, minus excluded ones.

    content -- the text to search
    pattern -- regular expression source; compiled with re.S so that
               ``.`` can also match a newline
    exclude -- optional regular expression source; any match in which it
               is found (anywhere) is dropped

    The result is whatever ``findall`` yields for *pattern*: with a single
    () group, as in all the patterns above, that is a list of the group
    texts in document order.
    """
    matches = re.compile(pattern, re.S).findall(content)
    if exclude is None:
        return matches
    exclude_re = re.compile(exclude, re.S)
    return [match for match in matches if not exclude_re.search(match)]


def build_report(content, patterns=None, exclusions=None):
    """Return the full text report for *content* as one string.

    patterns   -- name -> regex mapping; defaults to the module-level pats
    exclusions -- name -> regex mapping; defaults to the module-level notpats

    Pattern names are processed in sorted order.  For each one a header
    line is emitted, followed by one numbered line per surviving match.
    The spacing (including the doubled spaces and the trailing space on a
    header without an exclusion) reproduces what the original Python 2
    ``print`` statements wrote.
    """
    if patterns is None:
        patterns = pats
    if exclusions is None:
        exclusions = notpats

    pieces = []
    for key in sorted(patterns):
        pattern = patterns[key]
        exclude = exclusions.get(key)

        pieces.append("\n%s : looking for  %s" % (key, pattern))
        if exclude is not None:
            pieces.append(" without  %s\n" % exclude)
        else:
            pieces.append(" \n")

        count = 0
        for match in find_matches(content, pattern, exclude):
            count += 1
            pieces.append("%d :  %s\n" % (count, match))
    return "".join(pieces)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    argv -- argument list including the program name; defaults to sys.argv

    Returns 0 after printing the report, or 2 (after writing a usage
    message to stderr) when no file name or URL was given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write(
            "usage: get.py anyfile\n"
            "   or: get.py http://anything.anywhere.com/anyfile.html\n"
        )
        return 2

    content = read_content(argv[1])  # get filename from command line
    sys.stdout.write(build_report(content))
    return 0


if __name__ == "__main__":
    sys.exit(main())