;;;; System that uses heuristic search to search the web, starting at ;;;; a given initial page and following links trying to find a page that ;;;; contains a given set of key words. Uses beam search with a heuristic ;;;; that in addition to the goal strings considers a list of "help strings" ;;;; to guide the search. The function web-search is the main interface. ;;;; Version that allows external executable to be used to score pages. ;;;; Communicates with external executable commands via files to score pages ;;;; and links. All files are written and read from the current working ;;;; directory. ;;;; When a page is downloaded it is scored by a call to the shell command ;;;; "scorepage" which should use the following INPUT FILES: ;;;; ;;;; want_strings file of want strings one per line ;;;; help_strings file of help strings one per line ;;;; page.html html file to be scored ;;;; link_path file of link anchor texts one per line, giving the ;;;; path of links from the start page to the given page ;;;; ;;;; and generate the following OUTPUT FILE: ;;;; ;;;; page_score file with one number for the score of the page ;;;; When links are generated from a page they are all scored at once ;;;; by a call to the shell command "scorelinks" which should use the ;;;; INPUT FILES: ;;;; ;;;; want_strings file of want strings one per line ;;;; help_strings file of help strings one per line ;;;; page.html html file giving surrounding context ;;;; link_path file of link anchor texts one per line, giving the ;;;; path of links from the start page to the given page ;;;; links.html file of N links ... each one starting a new line ;;;; and in the order found in the file. If the text of the link ;;;; contains newline's in page.html they are also in links.html ;;;; ;;;; and should generate the OUTPUT FILE: ;;;; ;;;; link_scores file of N numbers, one per line, which are the scores of ;;;; the links in links.html in order ;;;; ;;;; If the correct form of output files are not generated, the Lisp program ;;;; will complain with an appropriate error. ;;;------------------------------ ;;; Basic data structures ;;;------------------------------ (defstruct (web-page (:print-function print-web-page)) url ; URL string for page text ; String for text on page score ; Heuristic evaluation score parent-hyperlink) ; Parent link that led to this page in search (defun print-web-page (page stream depth) "Print only URL for a web page" (declare (ignore depth)) (format stream "" (web-page-url page))) (defstruct hyperlink url ; string for URL parameters ; alist of other (parameter value) other than (href URL) text ; string of text in the hyperlink full-text ; complete text of link .... parent-page ; web page this link is in score) ; heuristic evaluation of goodness of following link (defstruct url host ; Host name string directory ; Directory name string file) ; File name string (defstruct html-command name ; Name of command, e.g. "a", "p", etc. end ; Is it the end marker for command, e.g. "/a" parameters) ; Alist of parameters and values ;;;------------------------------ ;;; Special character lists ;;;------------------------------ (defparameter *white-chars* '(#\Newline #\Space #\Tab)) (defparameter *punctuation-chars* '(#\! #\@ #\# #\% #\$ #\% #\^ #\& #\* #\( #\) #\- #\_ #\{ #\} #\[ #\] #\: #\; #\" #\' #\, #\. #\? #\/ #\+ #\= #\~ #\` #\< #\> #\| #\\ #\Newline)) ;;;------------------------------ ;;; Search interface ;;;------------------------------ (defun web-search (start-url want-strings help-strings beam-width) "Conduct the web search using beam search with the appropriate goal, successor, and cost functions (see assignment definition)" (write-strings-to-file want-strings "want_strings") (write-strings-to-file help-strings "help_strings") (let ((goal-page (beam-search (initialize-web-page start-url want-strings help-strings) #'(lambda (state) (find-all-strings-page want-strings state)) #'web-successors #'(lambda (state) (score-state state want-strings help-strings)) beam-width))) (when goal-page ;; Print solution path if tracing (format t "~&~%Solution Path: ~&~{~&~A~}" (web-path goal-page)) goal-page))) (defun initialize-web-page (url want-strings help-strings) (declare (ignore want-strings help-strings)) "Create an initial web-page for this URL and score it according to the presence of the want and help strings" (let ((web-page (make-web-page :url url :text (get-url-to-string url)))) (setf (web-page-score web-page) (score-page-external web-page)) web-page)) (defun web-path (state) "Return a list of web-pages and hyperlinks specifying the path from the initial start page to this state" (cond ((null state) nil) ((web-page-p state) (append (web-path (web-page-parent-hyperlink state)) (list (web-page-url state)))) ((hyperlink-p state) (append (web-path (hyperlink-parent-page state)) ;; Include link text for links but eliminate line breaks and leading white-space (list (format nil " Follow link: \"~A\" to" (string-trim *white-chars* (substitute #\Space #\Newline (hyperlink-text state))))))) (t (error "Illegal web state")))) (defun find-all-strings-page (strings state) "Return T iff the state is a web page containing all of the strings in the given list." (and (web-page-p state) (every #'(lambda (string) (string-search string (web-page-text state) :test (if (upper-case-p (char string 0)) #'char=-white #'char-equal-white))) strings))) ;;;------------------------------ ;;; Basic search code ;;;------------------------------ ;;; This code implements basic beam search taken from ;;; Peter Norvig, Paradigms of Artificial Intelligence Programming: Case Studies in Common Lisp, ;;; Morgan Kaufman Publishers, San Fancisco, CA, 1992. (defun beam-search (start goal-p successors cost-fn beam-width) "Search highest scoring states first until goal is reached, but never consider more than beam-width states at a time." (tree-search (list start) goal-p successors #'(lambda (old new) (let ((sorted (funcall (sorter cost-fn) old new))) (if (> beam-width (length sorted)) sorted (subseq sorted 0 beam-width)))))) (defun tree-search (states goal-p successors combiner) "Find a state that satisfies goal-p. Start with states, and search according to successors and combiner." (cond ((null states) nil) ((funcall goal-p (first states)) (first states)) (t (tree-search (funcall combiner (funcall successors (first states)) (rest states)) goal-p successors combiner)))) (defun sorter (cost-fn) "Return a combiner function that sorts according to cost-fn." #'(lambda (new old) (sort (append new old) #'< :key cost-fn))) ;;;------------------------------ ;;; Heuristic scoring ;;;------------------------------ (defun score-state (state want-strings help-strings) "Score a state based on if it is a web page or a hyperlink. Assumes scores have already been computed when state is created" (declare (ignore want-strings help-strings)) (cond ((web-page-p state) (web-page-score state)) ((hyperlink-p state) (hyperlink-score state)))) ;;;; Utilities for downloading web pages, pulling out hyperlinks, ;;;; and generating web-page and hyperlink states with successor ;;;; functions to support automated web searching. ;;;------------------------------ ;;; Successor generator ;;;------------------------------ ;;; Main code for generating successors of web states. Web states ;;; are pages or hyperlinks, web-pages generating hyperlinks and ;;; each hyperlink generates one web page. The path of links ;;; traversed is maintained in the parent-page slots of links ;;; and parent-hyperlink slots of web-pages as they are generated (defun follow-hyperlink (hyperlink) "Download and create web-page by following the given link" (let ((page (make-web-page :url (hyperlink-url hyperlink) :text (get-url-to-string (hyperlink-url hyperlink)) :parent-hyperlink hyperlink))) (setf (web-page-score page) (score-page-external page)) page)) (defun web-page-hyperlinks (web-page) "Generate and return a list of all hyperlinks on web page" (let ((hyperlinks (extract-complete-hyperlinks (web-page-text web-page) (web-page-url web-page)))) ;; Remove any links that just link back to parent states of this web page (setf hyperlinks (delete-if #'(lambda (link) (ancestor-url (hyperlink-url link) web-page)) hyperlinks)) ;; Record the parent page for these links (dolist (link hyperlinks) (setf (hyperlink-parent-page link) web-page)) (score-links-external hyperlinks web-page) hyperlinks)) (defun ancestor-url (url state) "Return T is this URL was along the path used to generate this state" (cond ((null state) nil) ((web-page-p state) (or (equal (web-page-url state) url) (ancestor-url url (web-page-parent-hyperlink state)))) ((hyperlink-p state) (or (equal (hyperlink-url state) url) (ancestor-url url (hyperlink-parent-page state)))))) (defun web-successors (state) "Generate successors of a web state. States are either web pages or hyperlinks. The successor of a web page is all the hyperlinks it contains. The sole sucessor of a hyperlink is the web page it points to" (cond ((web-page-p state) (format t "~&;; Generating Links from ~A (score ~A)" (web-page-url state) (web-page-score state)) (web-page-hyperlinks state)) ((hyperlink-p state) (format t "~&;; Following Link to ~A (score ~A)" (hyperlink-url state) (hyperlink-score state)) (list (follow-hyperlink state))) (t (error "Illegal web state")))) (defun read-file (file-name) (with-open-file (input file-name :direction :input) (read input nil nil))) (defun read-file-into-list (file-name) (let (result item) (with-open-file (input file-name :direction :input) (loop (setf item (read input nil :eof)) (when (eq item :eof) (return (nreverse result))) (push item result))))) (defun score-page-external (page) "Call unix executable to score page" ;;; Assumes page already exists in page.html file ;; Write out link path to the file "link_path" (write-strings-to-file (web-link-path page) "link_path") (shell "scorepage") ; run scoring routine ;; Read in score from page_score file (let ((score (read-file "page_score"))) (if (numberp score) score (error "page_score file does not contain a number")))) (defun web-link-path (state) "Return a list of hyperlink anchor texts specifying the path from the initial start page to this state" (cond ((null state) nil) ((web-page-p state) (web-link-path (web-page-parent-hyperlink state))) ((hyperlink-p state) (append (web-link-path (hyperlink-parent-page state)) (list (string-trim *white-chars* (substitute #\Space #\Newline (hyperlink-text state)))))) (t (error "Illegal web state")))) (defun score-links-external (hyperlinks web-page) (when hyperlinks (with-open-file (output "page.html" :direction :output :if-exists :supersede) (write-string (web-page-text web-page) output)) (write-strings-to-file (web-link-path web-page) "link_path") (with-open-file (output "links.html" :direction :output :if-exists :supersede) (dolist (hyperlink hyperlinks) (write-string (hyperlink-full-text hyperlink) output) (terpri output))) (shell "scorelinks") (let ((scores (read-file-into-list "link_scores"))) (unless (eq (length scores) (length hyperlinks)) (error "Wrong number of items in link_scores file")) (loop for hyperlink in hyperlinks as score in scores do (unless (numberp score) (error "link_scores file item not a number")) (setf (hyperlink-score hyperlink) score))))) ;;;------------------------------ ;;; Netscape interface ;;;------------------------------ (defun netscape-page (web-page) "Pop up a web page in Netscape remotely (assumes you already have Netscape running)" (shell (format nil "netscape -remote 'openURL(~A)'" (web-page-url web-page)))) ;;;------------------------------ ;;; Basic string input and search ;;;------------------------------ (defmacro file-length2 (file-name) "Returns the length of a file in characters" `(with-open-file (input ,file-name :direction :input) (file-length input))) (defun read-file-into-string (file-name) "Reads the contents of a file into a string and returns it" (let* ((length (file-length2 file-name)) (string (make-string length))) (with-open-file (input file-name :direction :input) (dotimes (i length string) (setf (char string i) (read-char input)))))) (defun write-strings-to-file (strings file-name) "Write strings to a file, one per line" (with-open-file (output file-name :direction :output :if-exists :supersede) (dolist (string strings) (write-string string output) (terpri output)))) (defun find-char-not-escaped (char string &key (test #'char=) (start 0)) "Searches for first occurence of char in string that is not escaped, i.e. preceded by a '\'. Char can also be a list of chars and search for ANY of them. Returns position where found or NIL." (loop for i from start below (length string) do (if (and (if (listp char) (member (char string i) char :test test) (funcall test (char string i) char)) (or (= i start) (not (char= (char string (1- i)) #\\)))) (return i)))) (defun find-not-char (char string &key (test #'char=) (start 0)) "Searches for first character in string that is not char (or member of char if it is a list. Returns position where found or NIL." (loop for i from start below (length string) do (unless (if (listp char) (member (char string i) char :test test) (funcall test (char string i) char)) (return i)))) (defun string-search (string text &key (test #'char=) (start 0)) "Searches for string in text (also a string), returning position of first character if found or NIL" (loop for i from start below (length text) do (if (string-prefix string text :start i :test test) (return i)))) (defun string-prefix (string text &key (test #'char=) (start 0)) "Returns T if string is a prefix of string text" (let ((len-text (length text))) (dotimes (i (length string) T) (if (and (< start len-text)(funcall test (char string i) (char text start))) (incf start) (return nil))))) (defun string-suffix (string text &key (test #'char=)) (let ((lt (length text)) (ls (length string))) (and (>= lt ls) (string-prefix string text :test test :start (- lt ls))))) (defun char=-white (char1 char2) "Case sensitive character equality where white space is white space" (cond ((char= char1 char2) t) ((member char1 *white-chars*) (member char2 *white-chars*)))) (defun char-equal-white (char1 char2) "Case insensitive character equality where white space is white space" (cond ((char-equal char1 char2) t) ((member char1 *white-chars*) (member char2 *white-chars*)))) ;;;--------------------------------- ;;; Parsing HTML commands ;;;--------------------------------- (defparameter *end-html-command-name* (cons #\> *white-chars*) "Characters that indicate the end of an HTML command name") (defun parse-html-command (string &key (start 0)) "String (from an HTML file) should start with a HTML command at position start. Returns a structure for the command and the first position in the string after the end of the command" (when (char= (char string start) #\<) ;; First find end position for command name (end-name) and position of first ;; character starting parameter list (pos) (let* ((end-name (find-char-not-escaped *end-html-command-name* string :start start)) (end-flag (char= (char string (1+ start)) #\/)) ; is it the end marker (e.g. ) parameters (pos (find-not-char *white-chars* string :start end-name)) new-pos param-name param-value) ;; Loop parsing parameter values for the command (e.g. href = "http://...") ;; until end of command ">" is found. (loop until (char= (char string pos) #\>) do (progn (setf new-pos (find-char-not-escaped #\= string :start pos)) (setf param-name (read-from-string (subseq string pos new-pos))) (setf pos (find-not-char *white-chars* string :start (1+ new-pos))) ; go to start of value (if (char= (char string pos) #\") ;; Value is a quoted string, find end of quote, and extract value. (progn (setf new-pos (find-char-not-escaped #\" string :start (1+ pos))) (setf param-value (subseq string (1+ pos) new-pos)) (incf new-pos)) ;; Else find end of value and extract it (progn (setf new-pos (find-char-not-escaped *end-html-command-name* string :start pos)) (setf param-value (let ((val (subseq string pos new-pos))) (if (find-char-not-escaped *punctuation-chars* val) val (read-from-string val)))))) (push (list param-name param-value) parameters) ; Save (param value) pair in parameter list (setf pos (find-not-char *white-chars* string :start new-pos)))) ; skip whitespace and go on (values (make-html-command :name (read-from-string (if end-flag (subseq string (+ start 2) end-name) (subseq string (+ start 1) end-name))) :end end-flag :parameters (nreverse parameters)) (1+ pos))))) ;;;------------------------------ ;;; Hyperlink extraction ;;;------------------------------ (defun extract-hyperlinks (string) "Extract all hyperlinks from text and return as a list" (let (links (pos 0)) (loop (multiple-value-bind (hyperlink end-pos) (find-next-hyperlink string :start pos) (cond (hyperlink (push hyperlink links)(setf pos end-pos)) (t (return (nreverse links)))))))) (defun find-next-hyperlink (string &key (start 0)) "Returns parsed hyperlink for first reference found in string from start position" ;; Find a " command (multiple-value-bind (html-command end-pos) (parse-html-command string :start start) ;; Find end of link position to locate link text (let ((end-ref (string-search " ~A" url filename))) (defun get-url-to-string (url) "Download URL and put contents into a string" (get-url url "page.html") (read-file-into-string "page.html"))