>>> from dict_of_dicts import * >>> db {'rfranklin': {'born': 1920, 'notes': 'contributed to discovery of DNA', 'surname': 'Franklin', 'died': 1957, 'forename': 'Rosalind'}, 'rcarson': {'surname': 'Carson', 'author': ['Silent Spring'], 'notes': 'raised awareness of effects of DDT', 'born': 1907, 'forename': 'Rachel', 'died': 1964}, 'jgoodall': {'surname': 'Goodall', 'author': ['In the Shadow of Man', 'The Chimpanzees of Gombe'], 'notes': 'primate researcher', 'born': 1934, 'forename': 'Jane', 'died': None}} A database - an organized collection of data. Like dictionaries, you look things up in a database according to a key. Say, an ID. We can use dictionaries for simple databases. This is something you could do if you program for your own purposes. keys: say, first initial, last name For each entry, we store different pieces of information. some simple strings: born, notes, two name fields. but we can also have lists as values. the list of books for which the person is an author. >>> db.keys() dict_keys(['rfranklin', 'rcarson', 'jgoodall']) >>> db['rcarson'] {'surname': 'Carson', 'author': ['Silent Spring'], 'notes': 'raised awareness of effects of DDT', 'born': 1907, 'forename': 'Rachel', 'died': 1964} >>> # what books is jgoodall the author of? ... >>> db['jgoodall'] {'surname': 'Goodall', 'author': ['In the Shadow of Man', 'The Chimpanzees of Gombe'], 'notes': 'primate researcher', 'born': 1934, 'forename': 'Jane', 'died': None} >>> db['jgoodall']['author'] ['In the Shadow of Man', 'The Chimpanzees of Gombe'] >>> # when was she born? ... >>> db['jgoodall']['born'] 1934 >>> # note that there are different fields in different entries. rfranklin didn't author any books, e.g. ... >>> # for the entire database, print all the books authored by people ... # If someone is not an author, print that ... >>> for k in db: ... if 'author' in db[k].keys(): ... print(k,db[k]["author"]) ... else: ... print(k,"is not an author") ... 
rfranklin is not an author rcarson ['Silent Spring'] jgoodall ['In the Shadow of Man', 'The Chimpanzees of Gombe'] >>> def pp(first,last,books): ... print(first,last,end="") ... if books: ... print(" is the author of the following books:") ... for b in books: ... print(b) ... else: ... print(" is not an author") ... This isn't quite right: >>> db {'rfranklin': {'born': 1920, 'notes': 'contributed to discovery of DNA', 'surname': 'Franklin', 'died': 1957, 'forename': 'Rosalind'}, 'rcarson': {'surname': 'Carson', 'author': ['Silent Spring'], 'notes': 'raised awareness of effects of DDT', 'born': 1907, 'forename': 'Rachel', 'died': 1964}, 'jgoodall': {'surname': 'Goodall', 'author': ['In the Shadow of Man', 'The Chimpanzees of Gombe'], 'notes': 'primate researcher', 'born': 1934, 'forename': 'Jane', 'died': None}} >>> for k in db: ... pp(db[k]['forename'],db[k]['surname'],db[k]['author']) ... Traceback (most recent call last): File "<stdin>", line 2, in <module> KeyError: 'author' >>> for k in db: ... if 'author' in db[k]: ... pp(db[k]['forename'],db[k]['surname'],db[k]['author']) ... else: ... pp(db[k]['forename'],db[k]['surname'],[]) ... Rosalind Franklin is not an author Rachel Carson is the author of the following books: Silent Spring Jane Goodall is the author of the following books: In the Shadow of Man The Chimpanzees of Gombe [db.py] ==== # The probabilities of detecting certain types of subatomic particles particles = {'neutron':0.55,'proton':0.21,'meson':0.03,'muon':0.07,'neutrino':0.14} Return the particle that is the least probable >>> from particles import * >>> particles {'muon': 0.07, 'neutrino': 0.14, 'meson': 0.03, 'proton': 0.21, 'neutron': 0.55} >>> for k in particles: ... print(k) ... muon neutrino meson proton neutron >>> for k in particles: ... print(particles[k]) ... 0.07 0.14 0.03 0.21 0.55 >>> #keep track of the lowest value found so far. ... #if you find a lower value, remember that one ... >>> least = 0.07 >>> #0.14 is higher, so don't change it ... 
>>> least = 0.03 # since that's a lower value >>> >>> #0.21 and 0.55 are high, so we don't change least ... >>> least 0.03 >>> #This is our answer ... >>> #algorithm? ... >>> #algorithm? ... >>> for p in particles: ... if particles[p] < least: ... least = particles[p] ... >>> # In a general function, what should least be set to? ... >>> # A number that is HIGHER than any number we will see! ... >>> #Then, we are sure its value will be updated starting ... #with the first number tested ... >>> #Since these are probs, they are between 0 and 1. ... >>> least = 2 >>> for p in particles: ... if particles[p] < least: ... least = particles[p] ... >>> least 0.03 >>> >>> particles {'muon': 0.07, 'neutrino': 0.14, 'meson': 0.03, 'proton': 0.21, 'neutron': 0.55} >>> Trace: least 2 0.07 0.03 compare: particles['muon'] < 2? yes compare: particles['neutrino'] < 0.07? no compare: particles['meson'] < 0.07? yes ...neither of the remaining two is less, so we finish with least == 0.03 BUT: we want to know the particle that is the least probable, not the least probable value! >>> leastval = 2 >>> leastname = "" >>> for p in particles: ... if particles[p] < leastval: ... leastval = particles[p] ... leastname = p ... >>> leastval 0.03 >>> leastname 'meson' >>> [subatomic.py] ==== OK now that you've had more practice with dictionaries, let's deconstruct wannabe.py!! >>> # wannabe ... >>> # first, let's work with some mechanics before thinking of the 'big picture' ... >>> #tuples are like lists, but you cannot change them. ... # they are immutable ... >>> l = [0,1,2] >>> l[0] = 44 >>> l [44, 1, 2] >>> t = (0,1,2) >>> t[0] = 44 Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: 'tuple' object does not support item assignment >>> #you access them the same way ... >>> t[1] 1 >>> t[2] 2 >>> # you can use + to create a new tuple ... 
>>> t + t (0, 1, 2, 0, 1, 2) >>> t (0, 1, 2) >>> newT = t + t >>> newT (0, 1, 2, 0, 1, 2) >>> (55,66,77) + (22,33) (55, 66, 77, 22, 33) >>> #the weird thing about tuples in Python is that ... #a tuple with a single value has an extra , ... >>> t = (55,) >>> type(t) <class 'tuple'> >>> # What does this return? >>> type((55)) >>> type((55,)) >>> type((55)) <class 'int'> >>> type((55,)) <class 'tuple'> OK? >>> # here is a tuple of empty strings ... >>> x = ('','','') >>> x[0] '' >>> x[1] '' >>> x[2] '' >>> len(x) 3 >>> len(x[0]) 0 >>> len(x[1]) 0 >>> >>> #Suppose we want a tuple of N 1s >>> N = 3 >>> #algorithm used in wannabe.py: ... # initialize x to (1,) - a tuple containing a single 1 ... # x = x + (1,) ... # repeat that line for N-1 times >>> x = (1,) >>> x = x + (1,) >>> x (1, 1) >>> x = x + (1,) >>> x (1, 1, 1) >>> # See, we repeated the + line 2 times, N - 1 times. >>> N = 3 >>> context = ('',) >>> context = context + ('',) >>> context ('', '') >>> context = context + ('',) >>> context ('', '', '') >>> # again, we repeated the 'context' line two times. >>> # now, let's use range to determine how many times to do the + line ... >>> range(N) range(0, 3) >>> for i in range(N): ... print(i) ... 0 1 2 >>> # That's too many times, since we initialized x to ('',) ... >>> for i in range(N-1): ... print(i) ... 0 1 >>> # That's correct ... >>> # instead of N, the program uses 'context_length' ... >>> context_length = 3 >>> for i in range(context_length-1): ... print(i) ... 0 1 >>> # now, let's use range to determine how many times to do the + line ... >>> range(N) range(0, 3) >>> for i in range(N): ... print(i) ... 0 1 2 >>> # That's too many times, since we initialized x to ('',) ... >>> for i in range(N-1): ... print(i) ... 0 1 >>> # That's correct ... >>> # instead of N, the program uses 'context_length' ... >>> context_length = 3 >>> for i in range(context_length-1): ... print(i) ... 0 1 >>> # now, let's use range to determine how many times to do the + line ... >>> range(N) range(0, 3) >>> for i in range(N): ... 
print(i) ... 0 1 2 >>> # That's too many times, since we initialized x to ('',) ... >>> for i in range(N-1): ... print(i) ... 0 1 >>> # That's correct ... >>> # instead of N, the program uses 'context_length' ... >>> context_length = 3 >>> for i in range(context_length-1): ... print(i) ... 0 1 >>> #so, here's the code! ... >>> context = ('',) >>> for i in range(context_length-1): ... context = context + ('',) ... >>> context ('', '', '') >>> #In general, after this loop, context will contain context_length empty strings! ... >>> Go to: [wannabeWithPrints.py] Look at the two places, with my print statements Call it with context_length changing. 2, then 5, etc. >>> # this is equivalent >>> context = () >>> type(context) <class 'tuple'> >>> for i in range(context_length): ... context = context + ('',) ... >>> context ('', '', '') >>> context_length 3 >>> #actually, Python lets you do this ... >>> context = ('',) * context_length >>> context ('', '', '') >>> context_length = 6 >>> context = ('',) * context_length >>> context ('', '', '', '', '', '') >>> OK - so initializing context should be very clear now! Let's build the dictionary, with an easy example [newsimple.txt] Change [wannabeWithPrints.py] - comment out =>1 prefix_length back to 1 uncomment ns = ... newsimple.txt and =>2 [newsimple.txt] one two three four one two three four one two three four one two three four one two three four one two three four =>2 {('one',): ['two', 'two', 'two', 'two', 'two', 'two'], ('four',): ['one', 'one', 'one', 'one', 'one'], ('two',): ['three', 'three', 'three', 'three', 'three', 'three'], ('',): ['one'], ('three',): ['four', 'four', 'four', 'four', 'four', 'four']} 'one' is followed by 'two' 6 times 'four' is followed by 'one' 5 times (so, the code handles the new lines!) and so on. Now: we have the special case ('',):['one'] ('',) is the initial value of context, right? 'one' is the first word in the text key: the context of empty strings - value: the first word in the text. This is to get things started. 
Now, run it with prefix_length 2 one two three four one two three four one two three four one two three four one two three four one two three four =>2 {('', ''): ['one'], ('two', 'three'): ['four', 'four', 'four', 'four', 'four', 'four'], ('', 'one'): ['two'], ('three', 'four'): ['one', 'one', 'one', 'one', 'one'], ('one', 'two'): ['three', 'three', 'three', 'three', 'three', 'three'], ('four', 'one'): ['two', 'two', 'two', 'two', 'two']} context length is 2. so, we are remembering what comes after each 2-sequence of words (called a 'bigram' in natural language processing) special case: ('', ''): ['one'] initial context:[first word] the window moves one word at a time. ('', 'one'): ['two'] this 'two' is the first one in the text nothing, one -- a two follows it! ('one','two'): ['three', 'three', 'three', 'three', 'three', 'three'] ('four', 'one'): ['two', 'two', 'two', 'two', 'two'] Now, change context_length to 3: =>2 {('one', 'two', 'three'): ['four', 'four', 'four', 'four', 'four', 'four'], ('two', 'three', 'four'): ['one', 'one', 'one', 'one', 'one'], ('', 'one', 'two'): ['three'], ('', '', ''): ['one'], ('three', 'four', 'one'): ['two', 'two', 'two', 'two', 'two'], ('', '', 'one'): ['two'], ('four', 'one', 'two'): ['three', 'three', 'three', 'three', 'three']} OK? See what we are building? 
look at the code uncomment =>3 Run the code - it's ok, there's a lot of output; we'll just look at the first line's output context is initially ('','') =>3 word_list is ['one', 'two', 'three', 'four', 'one', 'two', 'three', 'four'] =>3 word is one =>3 we just updated context to ('', 'one') =>3 word is two =>3 we just updated context to ('one', 'two') =>3 word is three =>3 we just updated context to ('two', 'three') =>3 word is four =>3 we just updated context to ('three', 'four') =>3 word is one =>3 we just updated context to ('four', 'one') =>3 word is two =>3 we just updated context to ('one', 'two') =>3 word is three =>3 we just updated context to ('two', 'three') =>3 word is four =>3 we just updated context to ('three', 'four') etc. back to the shell for: context = context[1:] + (word,) >>> context = ('one','two') >>> word = 'three' >>> context[1:] ('two',) >>> (word,) ('three',) >>> context[1:] + (word,) ('two', 'three') >>> OK: now, the if-else in build-dict Shell: >>> # now suppose it's later in processing, context is the same ... # but word is "cow" ... >>> context ('one', 'two') >>> word = "cow" >>> context in word_dict True >>> word_dict[context] ['three'] >>> word_dict[context].append(word) >>> word_dict {('one', 'two'): ['three', 'cow']} >>> Into the program: =>4 uncomment reminder: show them [newsimple.txt] one two three four one two three four one two three four one two three four one two three four one two three four NOW - we are going to get a LOT of output. 
That's ok, we'll scroll up to the top *** so they see how word_dict is evolving =>4 =>4 current context: ('', '') =>4 next word: one =>4 word_dict is now: {('', ''): ['one']} =>4 =>4 current context: ('', 'one') =>4 next word: two =>4 word_dict is now: {('', ''): ['one'], ('', 'one'): ['two']} =>4 =>4 current context: ('one', 'two') =>4 next word: three =>4 word_dict is now: {('', ''): ['one'], ('', 'one'): ['two'], ('one', 'two'): ['three']} =>4 =>4 current context: ('two', 'three') =>4 next word: four =>4 word_dict is now: {('', ''): ['one'], ('two', 'three'): ['four'], ('', 'one'): ['two'], ('one', 'two'): ['three']} =>4 =>4 current context: ('three', 'four') =>4 next word: one =>4 word_dict is now: {('three', 'four'): ['one'], ('', ''): ['one'], ('two', 'three'): ['four'], ('', 'one'): ['two'], ('one', 'two'): ['three']} =>4 =>4 current context: ('four', 'one') =>4 next word: two =>4 word_dict is now: {('', ''): ['one'], ('two', 'three'): ['four'], ('', 'one'): ['two'], ('three', 'four'): ['one'], ('one', 'two'): ['three'], ('four', 'one'): ['two']} =>4 AHA - finally, we see the same key again. 
=>4 current context: ('one', 'two') =>4 next word: three =>4 word_dict is now: {('', ''): ['one'], ('two', 'three'): ['four'], ('', 'one'): ['two'], ('three', 'four'): ['one'], ('one', 'two'): ['three', 'three'], ('four', 'one'): ['two']} =>4 =>4 current context: ('two', 'three') =>4 next word: four =>4 word_dict is now: {('', ''): ['one'], ('two', 'three'): ['four', 'four'], ('', 'one'): ['two'], ('three', 'four'): ['one'], ('one', 'two'): ['three', 'three'], ('four', 'one'): ['two']} =>4 =>4 current context: ('three', 'four') =>4 next word: one =>4 word_dict is now: {('', ''): ['one'], ('two', 'three'): ['four', 'four'], ('', 'one'): ['two'], ('three', 'four'): ['one', 'one'], ('one', 'two'): ['three', 'three'], ('four', 'one'): ['two']} =>4 =>4 current context: ('four', 'one') =>4 next word: two =>4 word_dict is now: {('', ''): ['one'], ('two', 'three'): ['four', 'four'], ('', 'one'): ['two'], ('three', 'four'): ['one', 'one'], ('one', 'two'): ['three', 'three'], ('four', 'one'): ['two', 'two']} =>4 Special case at the end: if the last 2 words aren't yet in the dictionary, add them. with value [""] - nothing follows them (they are at the end!) Now, write_epic is straightforward. o analyzing and comparing alternative solutions for efficiency o Today, we will start thinking about algorithms and complexity o We'll use the substring finder program as an example to make some points. - We've been doing algorithm design since the first day - QU: what is an "algorithm?" A set of steps that accomplishes a task. - Algorithms don't have to be written in Python. Ikea bookshelves instructions: diagrams + English (sort of :-) recipe: English knitting pattern: a language known to knitters - a program is an algorithm written in a language a computer can interpret. - For many interesting problems, there are distinct *techniques* that all work - We're going to learn that sometimes, one of them is a bit faster than another, or one of them is MUCH faster than another. 
== The string matching problem == - string matching: Find all instances of a pattern within a string. - Application: finding binding sites on DNA - DNA is composed of sequences of 4 characters -- A, C, T, and G. - "binding sites" are short sequences of A, C, T, and G in the DNA - finding them is an important step to identifying how proteins are created in cells. - This is just an instance of the string matching. remember text.find(pattern,index) returns the first position where pattern occurs in text, starting at index [stringmaters.py - the one using "find"] def get_indices(text, pattern): '''Return a list containing the indices where str pattern appears in str text. Overlapping instances of pattern should be counted. For example, get_indices("abcabcabc", "bc") --> [1, 4, 7] get_indices("bbbbbbb", "bb") --> [0, 1, 2, 3, 4, 5]''' # Accumulate the indices of occurrences here. indices = [] # The position of the next occurrence of pattern within text. index = text.find(pattern, 0) # Keep looking until there are no more occurrences. while index != -1: # Record this occurrence. indices.append(index) # Advance past this occurrence and look for the next one. index = text.find(pattern, index + 1) return indices get in the shell: >>> text = "abcabcabc" >>> pattern = "bc" >>> # answer should be [1,4,7] ... >>> help(str.find) Help on method_descriptor: find(...) S.find(sub[, start[, end]]) -> int Return the lowest index in S where substring sub is found, such that sub is contained within s[start:end]. Optional arguments start and end are interpreted as in slice notation. Return -1 on failure. >>> index = text.find(pattern,0) >>> index 1 >>> indices = [] >>> indices.append(index) >>> # we found the pattern starting at 1. ... # Now, look in the string starting at 1+1 ... 
>>> index 1 >>> # we started at 2; the next 'bc' we found starts at 4 >>> index = text.find(pattern,index+1) >>> index 4 >>> indices.append(index) >>> indices [1, 4] >>> >>> index = text.find(pattern,index+1) >>> index 7 >>> #we looked in text, starting at 5; the next one we found was 7 ... >>> indices.append(index) >>> indices [1, 4, 7] >>> text 'abcabcabc' >>> # now look starting at 8 ... >>> index = text.find(pattern,index+1) >>> index -1 >>> - It relies heavily on str.find. - Let's assume we don't have "find"! - Exercise: With the person next to you, find instances of pattern in text: Text: CTACAATATATCGTATCATATCC Pattern: ATATC How did you do it? top down design - start in loose English, and then refine it into code start at the left of text see if the pattern matches there if it does, save it. go to the second char of text, and see if the pattern matches there. if it does, save it. continue through to the end of text === Text: CTACAATATATCGTATCATATCC Pattern: ATATC print(basic_stringmatch("CTACAATATATCGTATCATATCC","ATATC"), "sb",[7,17]) for each character in Text: for each character in Pattern: if the characters at the appropriate positions don't match, not a match #now, we've gone through the entire #pattern, and we know if we have a match or not if a match: add the match to the list === let's use a boolean! for each character in Text: for each character in Pattern: if the characters at the appropriate positions don't match, match = False <======**** if match: <======**** add the match to the list where do we make it true? for each character in Text: match = True <======**** start over for each potential match for each character in Pattern: if the characters at appropriate positions don't match, match = False if match: # you've gotten through the entire pattern, and match # survives to be true! add the match to the list Trace it on the board. 
Use an arrow to point to the current characters good to trace your high level algorithms - if it doesn't work when it is this vague, it won't work later! text = A T A T A T Pattern = A T For me: this is implemented in: print(basic_stringmatch("ATATAT","AT"), "sb", [0,2,4]) - Need to be precise at what we mean by "appropriate text character" For index from 0 to the last position we could have a match: For p_index from 0 to the length of Pattern: If the Pattern[p_index] doesn't match Text[index + p_index] Not a match. If a match: Add the match to the list So, e.g., "CTACAATATATCGTATCATATCC" "ATAT" suppose we've gotten up to index = 7 index 7 p_index 0 1 2 3 index + p_index 7 8 9 10 index 8 p_index 0 1 2 3 index + p_index 8 9 10 11 this is not a match. we will keep going. What's the last position we could have a match? The pattern is 4 characters. So, the last place we could have a match is with the character that is 4 positions from the end. In our case, the ATCC at the end. We'll have to calculate that. When you have to do these things, use an example. text = CTACAATC pattern = ATT *** For the outside for-loop: "CTACAATC" "ATT" ^ The last possible match what position is that? len("CTACAATC") is 8 positions are 0 through 7 the last three are 5, 6, 7 so, we want 0 through len(text) - len(pattern) so, we want 0 through 8 - 3 so, we want 0 through 5 ;; check, that works if we use range? Remember, range(0,5) is [0,1,2,3,4] we would be one short! for index in range(len(text) - len(pattern) + 1) just reason it step by step to figure this out, using an example. # (Does not use str.find.) def basic_stringmatch(text, pattern): '''Find all instances of the string pattern in the string text. Return a list containing the starting indices of all matches. This version iterates through text one character at a time, checking if the pattern matches starting at that character. 
It checks the entire pattern at each character.''' matches = [] # Iterate through text for index in range(len(text) - len(pattern) + 1): # Check if the pattern matches starting at this character match = True for p_index in range(len(pattern)): if text[index + p_index] != pattern[p_index]: match = False if match: matches.append(index) return matches == Comparing the two methods == - We wrote it with find, and then "by hand" - ours is much slower. [timing.py] there are sophisticated algorithms that don't actually have to go through the text one character at a time one is called the knuth, morris, pratt the built in "find" is written in a really smart way. We won't see the algorithm Python uses. But we can easily do better Let's stay in the inner loop, only as long as we are matching the pattern. Why continue on, after you know the match at that position has failed? One way to do this: def early_exit_stringmatch(text, pattern): '''Find all instances of the string pattern in the string text. Return a list containing the starting indices of all matches. This version iterates through text one character at a time, checking if the pattern matches starting at that character. It stops checking the pattern when a mismatch is detected.''' matches = [] # Iterate through text for index in range(len(text) - len(pattern) + 1): # Check if the pattern matches starting at this character p_index = 0 while p_index < len(pattern) and \ text[index + p_index] == pattern[p_index]: p_index = p_index + 1 # The pattern matches if the previous loop iterated through # the entire pattern. if p_index == len(pattern): matches.append(index) return matches