/* Basic sample C++ solution for robosurfer-ext.lisp

   Tested only on the CS public Linux machines.

   This program uses the C++ Standard Template Library (STL) extensively;
   see http://www.cs.rpi.edu/~musser/stl.html for more info.

   970204 James A. Bednar */

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

/* Most of the routines here have direct equivalents in robosurfer.lisp:

   Lisp                        C++
   --------------------------  -----------------------
   char=                       char_eq
   char-equal                  char_equal
   char=-white                 char_eq_white
   char-equal-white            char_equal_white
   read-file-into-string       read_file_into_string
   count-occurences            count_occurences
   score-strings               score_strings
   score-match                 score_match
   compute-web-page-score      compute_web_page_score
   compute-hyperlink-score     compute_hyperlink_score
*/

/******************************************************************************/
/* Utility functions                                                          */
/******************************************************************************/

/* Type for predicates that compare characters */
typedef bool (*binary_char_predicate)(const char char1, const char char2);

/* Case sensitive character equality */
inline bool char_eq(const char char1, const char char2)
{
  return (char1 == char2);
}

/* Case insensitive character equality.
   The characters are converted to unsigned char first because the <cctype>
   functions are undefined for negative values other than EOF. */
inline bool char_equal(const char char1, const char char2)
{
  return (std::tolower(static_cast<unsigned char>(char1)) ==
          std::tolower(static_cast<unsigned char>(char2)));
}

/* True if both characters are whitespace */
inline bool char_both_spaces(const char char1, const char char2)
{
  return (std::isspace(static_cast<unsigned char>(char1)) &&
          std::isspace(static_cast<unsigned char>(char2)));
}

/* Case sensitive character equality where white space is white space */
inline bool char_eq_white(const char char1, const char char2)
{
  return (char_eq(char1, char2) || char_both_spaces(char1, char2));
}

/* Case insensitive character equality where white space is white space */
inline bool char_equal_white(const char char1, const char char2)
{
  return (char_equal(char1, char2) || char_both_spaces(char1, char2));
}

/* Return the characters in the range [start, end) as a new string.
   Both iterators must refer to the same string; an empty or reversed range
   yields an empty string. */
std::string substring(std::string::const_iterator start,
                      std::string::const_iterator end)
{
  if (end - start <= 0)
    return std::string();
  return std::string(start, end);
}

/* Reads the contents of a file into a string and returns it.
   Prints a message and exits with status 1 if the file cannot be opened. */
std::string read_file_into_string(const std::string& filename)
{
  std::ifstream file(filename.c_str());
  if (!file) {
    std::cerr << "Could not find file " << filename << std::endl;
    std::exit(1);
  }

  /* The extra parentheses keep this from being parsed as a function
     declaration. */
  return std::string((std::istreambuf_iterator<char>(file)),
                     std::istreambuf_iterator<char>());
}

/* Reads a set of strings from the given file into an array of strings.
   Breaks at the given delimiter (matched case-insensitively), and trims off
   the last character of the delimiter from each string.

   Text following the final delimiter (e.g. a last line with no trailing
   newline) is returned as a final string.  An empty delimiter cannot split
   anything, so the whole text is then returned as a single string. */
std::vector<std::string> read_strings(const std::string& filename,
                                      const std::string& delimiter)
{
  const std::string text = read_file_into_string(filename);
  std::vector<std::string> strings;

  if (delimiter.empty()) {
    if (!text.empty())
      strings.push_back(text);
    return strings;
  }

  const std::string::difference_type delimiter_length =
      static_cast<std::string::difference_type>(delimiter.length());

  std::string::const_iterator start = text.begin();
  while (start != text.end()) {
    const std::string::const_iterator match =
        std::search(start, text.end(),
                    delimiter.begin(), delimiter.end(), char_equal);

    if (match == text.end()) {
      /* No further delimiter: keep the remainder rather than stepping past
         the end of the text. */
      strings.push_back(substring(start, text.end()));
      break;
    }

    /* Keep all but the last character of the delimiter. */
    strings.push_back(substring(start, match + (delimiter_length - 1)));
    start = match + delimiter_length;
  }
  return strings;
}

/* Counts the number of times the string is found in text, including
   overlapping matches.  If the string is capitalized, the match is
   case-sensitive, otherwise it is case insensitive.  In either case
   white-space characters match any other white-space character.

   An empty search string is never considered found; otherwise it would
   match at every position of the text. */
int count_occurences(const std::string& searchitem, const std::string& text)
{
  if (searchitem.empty())
    return 0;

  const binary_char_predicate test =
      std::isupper(static_cast<unsigned char>(searchitem[0]))
          ? char_eq_white
          : char_equal_white;

  int count = 0;
  std::string::const_iterator pos =
      std::search(text.begin(), text.end(),
                  searchitem.begin(), searchitem.end(), test);
  while (pos != text.end()) {
    ++count;
    /* Resume one character after the start of this match so that
       overlapping occurrences are counted too. */
    ++pos;
    pos = std::search(pos, text.end(),
                      searchitem.begin(), searchitem.end(), test);
  }
  return count;
}

/******************************************************************************/
/* Heuristic computation                                                      */
/******************************************************************************/

/* Computes two values: the number of the given strings that are found in
   the text (num_found) and the total number of occurences of these strings
   (total_occurences).  Both outputs are reset to zero first. */
void score_strings(const std::string& text,
                   const std::vector<std::string>& strings,
                   int& num_found, int& total_occurences)
{
  num_found = 0;
  total_occurences = 0;

  for (std::vector<std::string>::size_type i = 0; i < strings.size(); ++i) {
    const int count = count_occurences(strings[i], text);
    total_occurences += count;
    if (count)
      num_found++;
  }
}

/* Score how well want and help strings match text by weighting, in order,
   number of want strings found, number of help strings found, number of
   times want strings are found, and number of times help strings are
   found. */
float score_match(const std::vector<std::string>& want_strings,
                  const std::vector<std::string>& help_strings,
                  const std::string& text)
{
  int want_count, want_total, help_count, help_total;

  score_strings(text, want_strings, want_count, want_total);
  score_strings(text, help_strings, help_count, help_total);

  return static_cast<float>(1000 * want_count + 100 * help_count +
                            10 * want_total + help_total);
}

/* Score a web page based on how well its want and help strings are
   represented in the text of the page.  Make it negative since search code
   sorts low values first (by cost rather than benefit). */
inline float compute_web_page_score(const std::string& web_page,
                                    const std::vector<std::string>& want_strings,
                                    const std::vector<std::string>& help_strings)
{
  return -score_match(want_strings, help_strings, web_page);
}

/* Parse the text out of the given link, assuming no nested HTML commands
   within the anchor.  The text runs from just after the first '>' up to the
   closing anchor tag (matched case-insensitively), or to the end of the link
   if there is no closing tag.  A link with no '>' at all has no text. */
inline const std::string hyperlink_text(const std::string& link)
{
  const std::string end_marker = "</a>";

  const std::string::const_iterator tag_close =
      std::find(link.begin(), link.end(), '>');
  if (tag_close == link.end())
    return std::string();

  /* Look for the end marker only after the opening tag so the resulting
     range can never be reversed. */
  const std::string::const_iterator text_start = tag_close + 1;
  const std::string::const_iterator text_end =
      std::search(text_start, link.end(),
                  end_marker.begin(), end_marker.end(), char_equal_white);

  return substring(text_start, text_end);
}

/* Score a hyperlink as 1/2 from the score of how well its text matches the
   want and help strings and 1/2 from the score of its parent page in order
   to include more surrounding context.  parent_page_score is already a
   (negated) cost, as returned by compute_web_page_score, and the result is a
   cost as well. */
inline float compute_hyperlink_score(const std::string& hyperlink,
                                     float parent_page_score,
                                     const std::vector<std::string>& want_strings,
                                     const std::vector<std::string>& help_strings)
{
  return static_cast<float>(
      -(-0.5 * parent_page_score +
        0.5 * score_match(want_strings, help_strings,
                          hyperlink_text(hyperlink))));
}

/* Returns the name the program was invoked under with any leading directory
   components removed, or an empty string if it is not available. */
static std::string program_name(int argc, char **argv)
{
  if (argc < 1 || argv == 0 || argv[0] == 0)
    return std::string();

  const char *name = argv[0];
  const char *slash = std::strrchr(name, '/');
  return std::string(slash ? slash + 1 : name);
}

/* Entry point.  Always reads page.html, want_strings and help_strings from
   the current directory.  When invoked as "scorepage" it writes the page
   score to the file page_score; when invoked as "scorelinks" it writes one
   score per line of links.html to the file link_scores.  Returns 0 on
   success and 1 on failure. */
int main(int argc, char **argv)
{
  const std::string invoked_as = program_name(argc, argv);

  /* Read files */
  const std::string page = read_file_into_string("page.html");
  const std::vector<std::string> want_strings = read_strings("want_strings", "\n");
  const std::vector<std::string> help_strings = read_strings("help_strings", "\n");

  /* Compute score for page */
  const float page_score =
      compute_web_page_score(page, want_strings, help_strings);

  /* Write out page score if called on a page */
  if (invoked_as == "scorepage") {
    std::ofstream scorefile("page_score");
    if (!scorefile) {
      std::cerr << "Could not create page_score file\n";
      return 1;
    }
    scorefile << page_score << '\n';
  }
  /* Compute and write link scores if called on a list of links */
  else if (invoked_as == "scorelinks") {
    const std::vector<std::string> links = read_strings("links.html", "\n");
    std::ofstream link_scores("link_scores");
    if (!link_scores) {
      std::cerr << "Could not create link_scores file\n";
      return 1;
    }

    for (std::vector<std::string>::size_type i = 0; i < links.size(); ++i)
      link_scores << compute_hyperlink_score(links[i], page_score,
                                             want_strings, help_strings)
                  << '\n';
  }
  /* Anything else is a mistake in how the program was installed or called;
     say so instead of silently producing no output. */
  else {
    std::cerr << "Must be invoked as scorepage or scorelinks\n";
    return 1;
  }

  return 0;
}