;;;; System that uses heuristic search to search the web, starting at ;;;; a given initial page and following links trying to find a page that ;;;; contains a given set of key words. Uses beam search with a heuristic ;;;; that in addition to the goal strings considers a list of "help strings" ;;;; to guide the search. The function web-search is the main interface. ;;;------------------------------ ;;; Basic data structures ;;;------------------------------ (defstruct (web-page (:print-function print-web-page)) url ; URL string for page text ; String for text on page score ; Heuristic evaluation score parent-hyperlink) ; Parent link that led to this page in search (defun print-web-page (page stream depth) "Print only URL for a web page" (declare (ignore depth)) (format stream "" (web-page-url page))) (defstruct hyperlink url ; string for URL parameters ; alist of other (parameter value) other than (href URL) text ; string of text in the hyperlink full-text ; complete text of link .... parent-page ; web page this link is in score) ; heuristic evaluation of goodness of following link (defun print-states (states) (format t "~% States in current call to treesearch: ~% ") (mapcar #'(lambda (state) (if (web-page-p state) (format t " web-page: ~A ~%" (web-page-url state)) (format t " hyperlink: ~A ~%" (hyperlink-url state)))) states)) (defstruct url host ; Host name string directory ; Directory name string file) ; File name string (defstruct html-command name ; Name of command, e.g. "a", "p", etc. end ; Is it the end marker for command, e.g. "/a" parameters) ; Alist of parameters and values ;;;------------------------------ ;;; Special character lists ;;;------------------------------ (defparameter *white-chars* '(#\Newline #\Space #\Tab)) (defparameter *punctuation-chars* '(#\! #\@ #\# #\% #\$ #\% #\^ #\& #\* #\( #\) #\- #\_ #\{ #\} #\[ #\] #\: #\; #\" #\' #\, #\. #\? #\/ #\+ #\= #\~ #\` #\< #\> #\| #\\ #\Newline)) ;;;------------------------------ ;;; Search interface ;;;------------------------------ (defun web-search (start-url want-strings help-strings &optional (beam-width 10)) "Conduct the web search using beam search with the appropriate goal, successor, and cost functions (see assignment definition)" (let ((goal-page (beam-search (initialize-web-page start-url want-strings help-strings) #'(lambda (state) (find-all-strings-page want-strings state)) #'web-successors #'(lambda (state) (score-state state want-strings help-strings)) beam-width))) (when goal-page ;; Print solution path if tracing (format t "~&~%Solution Path: ~&~{~&~A~}" (web-path goal-page)) goal-page))) (defun initialize-web-page (url want-strings help-strings) "Create an initial web-page for this URL and score it according to the presence of the want and help strings" (let ((web-page (make-web-page :url url :text (get-url-to-string url)))) (setf (web-page-score web-page) (compute-web-page-score web-page want-strings help-strings)) web-page)) (defun web-path (state) "Return a list of web-pages and hyperlinks specifying the path from the initial start page to this state" (cond ((null state) nil) ((web-page-p state) (append (web-path (web-page-parent-hyperlink state)) (list (web-page-url state)))) ((hyperlink-p state) (append (web-path (hyperlink-parent-page state)) ;; Include link text for links but eliminate line breaks and leading white-space (list (format nil " Follow link: \"~A\" to" (string-trim *white-chars* (substitute #\Space #\Newline (hyperlink-text state))))))) (t (error "Illegal web state")))) (defun find-all-strings-page (strings state) "Return T iff the state is a web page containing all of the strings in the given list." (and (web-page-p state) (every #'(lambda (string) (string-search string (web-page-text state) :test (if (upper-case-p (char string 0)) #'char=-white #'char-equal-white))) strings))) ;;;------------------------------ ;;; Basic search code ;;;------------------------------ ;;; This code implements basic beam search taken from ;;; Peter Norvig, Paradigms of Artificial Intelligence Programming: Case Studies in Common Lisp, ;;; Morgan Kaufman Publishers, San Fancisco, CA, 1992. (defun beam-search (start goal-p successors cost-fn beam-width) "Search highest scoring states first until goal is reached, but never consider more than beam-width states at a time." (tree-search (list start) goal-p successors #'(lambda (old new) (let ((sorted (funcall (sorter cost-fn) old new))) (if (> beam-width (length sorted)) sorted (subseq sorted 0 beam-width)))))) (defun tree-search (states goal-p successors combiner) "Find a state that satisfies goal-p. Start with states, and search according to successors and combiner." (print-states states) (cond ((null states) nil) ((funcall goal-p (first states)) (first states)) (t (tree-search (funcall combiner (funcall successors (first states)) (rest states)) goal-p successors combiner)))) (defun sorter (cost-fn) "Return a combiner function that sorts according to cost-fn." #'(lambda (new old) (sort (append new old) #'< :key cost-fn))) ;;;------------------------------ ;;; Heuristic scoring ;;;------------------------------ (defun score-state (state want-strings help-strings) "Score a state based on if it is a web page or a hyperlink, if the score has not been computed yet, compute it, otherwise compute it and store the value in the score slot for future use to avoid repeated work scoring the same state" (cond ((web-page-p state) (or (web-page-score state) (setf (web-page-score state) (compute-web-page-score state want-strings help-strings)))) ((hyperlink-p state) (or (hyperlink-score state) (setf (hyperlink-score state) (compute-hyperlink-score state want-strings help-strings)))))) (defun compute-web-page-score (web-page want-strings help-strings) "Score a web page based on how well its want and help strings are represented in the text of the page. Make it negative since search code sorts low values first (by cost rather than benefit)" (- (score-match want-strings help-strings (web-page-text web-page)))) (defun compute-hyperlink-score (hyperlink want-strings help-strings) "Score a hyperlink as 1/2 from the score of how well its text matches the want and help strings and 1/2 from the score of its parent page in order to include more surrounding context" (let ((parent-score (web-page-score (hyperlink-parent-page hyperlink)))) (unless parent-score (setf parent-score (setf (web-page-score (hyperlink-parent-page hyperlink)) (compute-web-page-score (hyperlink-parent-page hyperlink) want-strings help-strings)))) (+ (* 0.5 parent-score) ; JW: changed the following so that score-match considers the ; full-text of the hyperlink, which is from the ... , ; instead of just the text of the hyperlink, which is just the ; ... The reason is that sometimes the ... is just the name ; of another gif file, so has no content in itself. ; Hopefully, the path names in has *some* ; information in these cases. ; (- (* 0.5 (score-match want-strings help-strings ; (hyperlink-text hyperlink))))))) (- (* 0.5 (score-match want-strings help-strings (hyperlink-full-text hyperlink))))))) (defun score-match (want-strings help-strings text) "Score how well want and need strings match text by weighting, in order, number of want strings found, number of help strings found, number of times want strings are found, and number of times help strings are found" (multiple-value-bind (want-count want-total) (score-strings want-strings text) (multiple-value-bind (help-count help-total) (score-strings help-strings text) (+ (* 1000 want-count) (* 100 help-count) (* 10 want-total) help-total)))) (defun score-strings (strings text) "Returns two values: The number of the list of strings found in the text and the total number of occurences of these strings" (let ((num-found 0) (total-occurences 0)) (dolist (string strings (values num-found total-occurences)) (let ((count (count-occurences string text))) (incf total-occurences count) (unless (zerop count) (incf num-found)))))) (defun count-occurences (string text) "Counts the number of times the string is found in text. If string is capitalized, match is case-sensitive, otherwise it is case insenstive. In either case white-space characters match any other white-space character." (let ((pos 0)(count 0) (test (if (upper-case-p (char string 0)) #'char=-white #'char-equal-white))) (loop (if (setf pos (string-search string text :start pos :test test)) (progn (incf count) (incf pos)) (return count))))) (defun char=-white (char1 char2) "Case sensitive character equality where white space is white space" (cond ((char= char1 char2) t) ((member char1 *white-chars*) (member char2 *white-chars*)))) (defun char-equal-white (char1 char2) "Case insensitive character equality where white space is white space" (cond ((char-equal char1 char2) t) ((member char1 *white-chars*) (member char2 *white-chars*)))) ;;;; Utilities for downloading web pages, pulling out hyperlinks, ;;;; and generating web-page and hyperlink states with successor ;;;; functions to support automated web searching. ;;;------------------------------ ;;; Successor generator ;;;------------------------------ ;;; Main code for generating successors of web states. Web states ;;; are pages or hyperlinks, web-pages generating hyperlinks and ;;; each hyperlink generates one web page. The path of links ;;; traversed is maintained in the parent-page slots of links ;;; and parent-hyperlink slots of web-pages as they are generated (defun follow-hyperlink (hyperlink) "Download and create web-page by following the given link" (make-web-page :url (hyperlink-url hyperlink) :text (get-url-to-string (hyperlink-url hyperlink)) :parent-hyperlink hyperlink)) (defun web-page-hyperlinks (web-page) "Generate and return a list of all hyperlinks on web page" (let ((hyperlinks (extract-complete-hyperlinks (web-page-text web-page) (web-page-url web-page)))) ;; Remove any links that just link back to parent states of this web page (setf hyperlinks (delete-if #'(lambda (link) (ancestor-url (hyperlink-url link) web-page)) hyperlinks)) ;; Record the parent page for these links (dolist (link hyperlinks hyperlinks) (format t "~% Successor hyperlink: ~A " (hyperlink-url link)) ; JW added (setf (hyperlink-parent-page link) web-page)))) (defun ancestor-url (url state) "Return T is this URL was along the path used to generate this state" (cond ((null state) nil) ((web-page-p state) (or (equal (web-page-url state) url) (ancestor-url url (web-page-parent-hyperlink state)))) ((hyperlink-p state) (or (equal (hyperlink-url state) url) (ancestor-url url (hyperlink-parent-page state)))))) (defun web-successors (state) "Generate successors of a web state. States are either web pages or hyperlinks. The successor of a web page is all the hyperlinks it contains. The sole sucessor of a hyperlink is the web page it points to" (cond ((web-page-p state) (format t "~&;; Generating successors of web-page ~A (score ~A)" (web-page-url state) (web-page-score state)) (web-page-hyperlinks state)) ((hyperlink-p state) (format t "~&;; Following link to ~A (score ~A)" (hyperlink-url state) (hyperlink-score state)) (list (follow-hyperlink state))) (t (error "Illegal web state")))) ;;;------------------------------ ;;; Netscape interface ;;;------------------------------ (defun netscape-page (web-page) "Pop up a web page in Netscape remotely (assumes you already have Netscape running)" (shell (format nil "netscape -remote 'openURL(~A)'" (web-page-url web-page)))) ;;;------------------------------ ;;; Basic string input and search ;;;------------------------------ (defmacro file-length2 (file-name) "Returns the length of a file in characters" `(with-open-file (input ,file-name :direction :input) (file-length input))) (defun read-file-into-string (file-name) "Reads the contents of a file into a string and returns it" (let* ((length (file-length2 file-name)) (string (make-string length))) (with-open-file (input file-name :direction :input) (dotimes (i length string) (setf (char string i) (read-char input)))))) (defun find-char-not-escaped (char string &key (test #'char=) (start 0)) "Searches for first occurence of char in string that is not escaped, i.e. preceded by a '\'. Char can also be a list of chars and search for ANY of them. Returns position where found or NIL." (loop for i from start below (length string) do (if (and (if (listp char) (member (char string i) char :test test) (funcall test (char string i) char)) (or (= i start) (not (char= (char string (1- i)) #\\)))) (return i)))) (defun find-not-char (char string &key (test #'char=) (start 0)) "Searches for first character in string that is not char (or member of char if it is a list. Returns position where found or NIL." (loop for i from start below (length string) do (unless (if (listp char) (member (char string i) char :test test) (funcall test (char string i) char)) (return i)))) (defun string-search (string text &key (test #'char=) (start 0)) "Searches for string in text (also a string), returning position of first character if found or NIL" (loop for i from start below (length text) do (if (string-prefix string text :start i :test test) (return i)))) (defun string-prefix (string text &key (test #'char=) (start 0)) "Returns T if string is a prefix of string text" (let ((len-text (length text))) (dotimes (i (length string) T) (if (and (< start len-text)(funcall test (char string i) (char text start))) (incf start) (return nil))))) (defun string-suffix (string text &key (test #'char=)) (let ((lt (length text)) (ls (length string))) (and (>= lt ls) (string-prefix string text :test test :start (- lt ls))))) ;;;--------------------------------- ;;; Parsing HTML commands ;;;--------------------------------- (defparameter *end-html-command-name* (cons #\> *white-chars*) "Characters that indicate the end of an HTML command name") (defun parse-html-command (string &key (start 0)) "String (from an HTML file) should start with a HTML command at position start. Returns a structure for the command and the first position in the string after the end of the command" (when (char= (char string start) #\<) ;; First find end position for command name (end-name) and position of first ;; character starting parameter list (pos) (let* ((end-name (find-char-not-escaped *end-html-command-name* string :start start)) (end-flag (char= (char string (1+ start)) #\/)) ; is it the end marker (e.g. ) parameters (pos (find-not-char *white-chars* string :start end-name)) new-pos param-name param-value) ;; Loop parsing parameter values for the command (e.g. href = "http://...") ;; until end of command ">" is found. (loop until (char= (char string pos) #\>) do (progn (setf new-pos (find-char-not-escaped #\= string :start pos)) (setf param-name (read-from-string (subseq string pos new-pos))) (setf pos (find-not-char *white-chars* string :start (1+ new-pos))) ; go to start of value (if (char= (char string pos) #\") ;; Value is a quoted string, find end of quote, and extract value. (progn (setf new-pos (find-char-not-escaped #\" string :start (1+ pos))) (setf param-value (subseq string (1+ pos) new-pos)) (incf new-pos)) ;; Else find end of value and extract it (progn (setf new-pos (find-char-not-escaped *end-html-command-name* string :start pos)) (setf param-value (let ((val (subseq string pos new-pos))) (if (find-char-not-escaped *punctuation-chars* val) val (read-from-string val)))))) (push (list param-name param-value) parameters) ; Save (param value) pair in parameter list (setf pos (find-not-char *white-chars* string :start new-pos)))) ; skip whitespace and go on (values (make-html-command :name (read-from-string (if end-flag (subseq string (+ start 2) end-name) (subseq string (+ start 1) end-name))) :end end-flag :parameters (nreverse parameters)) (1+ pos))))) ;;;------------------------------ ;;; Hyperlink extraction ;;;------------------------------ (defun extract-hyperlinks (string) "Extract all hyperlinks from text and return as a list" (let (links (pos 0)) (loop (multiple-value-bind (hyperlink end-pos) (find-next-hyperlink string :start pos) (cond (hyperlink (push hyperlink links)(setf pos end-pos)) (t (return (nreverse links)))))))) (defun find-next-hyperlink (string &key (start 0)) "Returns parsed hyperlink for first reference found in string from start position" ;; Find a " command (multiple-value-bind (html-command end-pos) (parse-html-command string :start start) ;; Find end of link position to locate link text (let (end-ref (href (assoc 'href (html-command-parameters html-command)))) ; extract URL from param list (if href (if (not (setf end-ref (string-search " ~A" url filename))) (defun get-url-to-string (url) "Download URL and put contents into a string" (get-url url "temp.html") (read-file-into-string "temp.html"))