Index: branches/init1/pywikipedia/test.py
@@ -0,0 +1,15 @@
+#
+# Test script: fetch a page, append a line, and save it back.
+#
+# (C) Rob W.W. Hooft, 2003
+# Distribute under the terms of the GPL.
+
+import wikipedia
+
+wikipedia.langs={'test':'test.wikipedia.org'}
+
+text=wikipedia.getPage('test','Robottest')
+text=text+'\nrobot was here\n'
+status,reason,data=wikipedia.putPage('test','Robottest',text)
+print status,reason
+
Property changes on: branches/init1/pywikipedia/test.py |
___________________________________________________________________ |
Added: svn:keywords |
+ Author Date Id Revision
Added: svn:eol-style |
+ native
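Note: the test script above assumes the page 'Robottest' already exists, since wikipedia.getPage raises wikipedia.NoPage for an empty page. A slightly more defensive variant (a sketch for illustration, not part of this commit; it assumes the edit form of a missing page still supplies the wpEdittime token that putPage needs) would be:

    import wikipedia

    wikipedia.langs = {'test': 'test.wikipedia.org'}
    try:
        text = wikipedia.getPage('test', 'Robottest')
    except wikipedia.NoPage:
        text = ''                       # page does not exist yet; start from scratch
    text = text + '\nrobot was here\n'
    status, reason, data = wikipedia.putPage('test', 'Robottest', text)
    print status, reason                # the wiki answers a successful save with a 302 redirect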
Index: branches/init1/pywikipedia/getlang.py |
@@ -0,0 +1,119 @@
+#
+# Script to check language links for years A.D.
+#
+# (C) Rob W.W. Hooft, 2003
+# Distribute under the terms of the GPL.
+
+import sys,copy,wikipedia
+
+# languages to check for missing links and modify
+mylangs = ('nl', )
+
+# selection of years to check
+years = range(900,999+1)
+#years=[879]
+
+# Set to 1 to actually change the pages
+forreal = 1
+
+# Summary used in the modification request
+wikipedia.setAction('Rob Hooft: automatic interwiki script for years')
+
+debug = 0
+
+def findlangs(year):
+    matrix={}
+    text={}
+    print year,"=",
+    missing=0
+    for code in wikipedia.languages(mylangs):
+        print code+":", ; sys.stdout.flush()
+        try:
+            t=wikipedia.getPage(code,year)
+        except wikipedia.NoPage:
+            if code in mylangs:
+                missing+=1
+                if missing==len(mylangs):
+                    # None of the mylangs has this page. Doesn't make sense.
+                    print
+                    return None,None
+            pass
+        else:
+            text[code]=t
+            l=wikipedia.getLanguageLinks(t)
+            l[code]=year # Add self-reference
+            matrix[code]=l
+    print
+    return text,matrix
+
+def assemblelangmatrix(m):
+    result={}
+    for dum,line in m.iteritems():
+        for code,name in line.iteritems():
+            if not code in m:
+                pass
+                #print "WARNING: Ignore %s from %s; did not see actual page there"%(code,dum)
+            elif code in result:
+                if result[code]!=name:
+                    print "WARNING: Name %s is either %s or %s"%(code,result[code],name)
+            else:
+                result[code]=name
+    return result
+
+def missingLanguages(m,line,thiscode):
+    # Figure out which of the references mentioned in the assembled 'line'
+    # are missing from the language page referred to by 'thiscode'.
+    result={}
+    for code,name in line.iteritems():
+        if code==thiscode:
+            pass
+        elif code in m[thiscode]:
+            pass
+        else:
+            result[code]=name
+    for code,name in m[thiscode].iteritems():
+        if not code in line:
+            print "WARNING: %s contains reference to unknown %s:%s"%(thiscode,code,name)
+        elif line[code]!=name:
+            print "WARNING: %s reference to %s is %s and not %s"%(thiscode,code,name,line[code])
+    return result
+
+def compareLanguages(old,new):
+    removing=[]
+    adding=[]
+    for code,name in old.iteritems():
+        if not new.has_key(code):
+            removing.append(code)
+    for code,name in new.iteritems():
+        if not old.has_key(code):
+            adding.append(code)
+    s=""
+    if adding:
+        s=s+" Adding:"+",".join(adding)
+    if removing:
+        s=s+" Removing:"+",".join(removing)
+    return s
+
+for year in years:
+    text,m=findlangs(str(year))
+    if m is None:
+        # None of the mylangs has this page
+        continue
+    proper=assemblelangmatrix(m)
+    for mycode in mylangs:
+        if mycode in m: # Page must be present in this language
+            ml=copy.copy(proper)
+            status=compareLanguages(m[mycode],ml)
+            if status:
+                print mycode,str(year),":",status
+            del ml[mycode]
+            s=wikipedia.interwikiFormat(ml)
+            newtext=s+wikipedia.removeLanguageLinks(text[mycode])
+            if debug:
+                print newtext
+            if newtext!=text[mycode]:
+                print "NOTE: Replacing %s: %s"%(mycode,s)
+                if forreal:
+                    status,reason,data=wikipedia.putPage(mycode,str(year),newtext)
+                    if str(status)!='302':
+                        print status,reason
Property changes on: branches/init1/pywikipedia/getlang.py |
___________________________________________________________________ |
Added: svn:keywords |
+ Author Date Id Revision
Added: svn:eol-style |
+ native
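For reference, the heart of getlang.py is the merge performed by assemblelangmatrix(): every fetched page contributes its interwiki links, links that point at a language whose page was never fetched are ignored, and conflicting titles are reported. A minimal self-contained sketch of that merge on toy data (the titles are hypothetical; this snippet is not part of the commit):

    matrix = {
        'nl': {'nl': '900', 'en': '900', 'sv': '900'},   # the 'sv' page was never fetched
        'en': {'en': '900', 'nl': '900'},
    }
    result = {}
    for dum, line in matrix.iteritems():
        for code, name in line.iteritems():
            if code not in matrix:
                pass                                     # target page was never seen; ignore the link
            elif code in result:
                if result[code] != name:
                    print "WARNING: Name %s is either %s or %s" % (code, result[code], name)
            else:
                result[code] = name
    print result                                         # {'en': '900', 'nl': '900'}; 'sv' was dropped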
Index: branches/init1/pywikipedia/.cvsignore |
@@ -0,0 +1 @@
+*.pyc
Property changes on: branches/init1/pywikipedia/.cvsignore |
___________________________________________________________________ |
Added: svn:eol-style |
+ native
Index: branches/init1/pywikipedia/wikipedia.py |
@@ -0,0 +1,161 @@
+# Library to get and put pages on Wikipedia
+import urllib,re
+
+# known wikipedia languages
+langs = {'en':'www.wikipedia.org',
+         'pl':'pl.wikipedia.org',
+         'da':'da.wikipedia.org',
+         'sv':'sv.wikipedia.org',
+         'zh':'zh.wikipedia.org',
+         'eo':'eo.wikipedia.org',
+         'nl':'nl.wikipedia.org',
+         'de':'de.wikipedia.org',
+         'fr':'fr.wikipedia.org',
+         'es':'es.wikipedia.org',
+         'it':'it.wikipedia.com',
+         'no':'no.wikipedia.com',
+         'pt':'pt.wikipedia.com',
+         'af':'af.wikipedia.com',
+         'fy':'fy.wikipedia.com',
+         'la':'la.wikipedia.com',
+         'ca':'ca.wikipedia.com',
+         'fi':'fi.wikipedia.com',
+         'ia':'ia.wikipedia.com',
+         'et':'et.wikipedia.com',
+         'cs':'cs.wikipedia.org',
+         }
+
+action = 'Rob Hooft - Wikipedia python library'
+
+debug = 0
+
+# Keep the modification time of all downloaded pages for an eventual put.
+edittime = {}
+
+# Local exceptions
+
+class Error(Exception):
+    """Wikipedia error"""
+
+class NoPage(Error):
+    """Wikipedia page does not exist"""
+
+# Library functions
+def unescape(s):
+    if '&' not in s:
+        return s
+    s = s.replace("&lt;", "<")
+    s = s.replace("&gt;", ">")
+    s = s.replace("&apos;", "'")
+    s = s.replace("&quot;", '"')
+    s = s.replace("&amp;", "&") # Must be last
+    return s
+
+def setAction(s):
+    """Set a summary to use for changed page submissions"""
+    global action
+    action = s
+
+def urlencode(query):
+    l=[]
+    for k, v in query:
+        k = urllib.quote(str(k))
+        v = urllib.quote(str(v))
+        l.append(k + '=' + v)
+    return '&'.join(l)
+
+def putPage(code, name, text):
+    """Upload 'text' on page 'name' to the 'code' language wikipedia."""
+    import httplib
+    host = langs[code]
+    if host[-4:] == '.com':
+        raise Error("Cannot put pages on a .com wikipedia")
+    address = '/w/wiki.phtml?title=%s&action=submit'%(name)
+    data = urlencode((
+        ('wpSummary', action),
+        ('wpMinoredit', '1'),
+        ('wpSave', '1'),
+        ('wpEdittime', edittime[code,name]),
+        ('wpTextbox1', text)))
+    if debug:
+        print text
+        print address
+        print data
+        #return None, None, None
+    headers = {"Content-type": "application/x-www-form-urlencoded"}
+    conn = httplib.HTTPConnection(host)
+    conn.request("POST", address, data, headers)
+    response = conn.getresponse()
+    data = response.read()
+    conn.close()
+    return response.status, response.reason, data
+
+def getPage(code, name):
+    """Get the contents of page 'name' from the 'code' language wikipedia"""
+    host = langs[code]
+    name = re.sub(' ', '_', name)
+    name = urllib.quote(name)
+    if host[-4:] == '.org': # New software
+        url = 'http://'+host+'/w/wiki.phtml?title='+name+'&action=edit'
+    elif host[-4:]=='.com': # Old software
+        url = 'http://'+host+'/wiki.cgi?action=edit&id='+name
+    if debug:
+        print url
+    f = urllib.urlopen(url)
+    text = f.read()
+    f.close()
+    m = re.search('value="(\d+)" name=\'wpEdittime\'',text)
+    if m:
+        edittime[code,name]=m.group(1)
+    else:
+        m = re.search('value="(\d+)" name="wpEdittime"',text)
+        if m:
+            edittime[code,name]=m.group(1)
+    i1 = re.search('<textarea[^>]*>',text).end()
+    i2 = re.search('</textarea>',text).start()
+    if i2-i1 < 2:
+        raise NoPage()
+    elif text[i1:i2] == 'Describe the new page here.\n':
+        raise NoPage()
+    else:
+        return unescape(text[i1:i2])
+
+def languages(first):
+    """Return a list of language codes for known wikipedia servers"""
+    result=[]
+    for key in first:
+        if key in langs.iterkeys():
+            result.append(key)
+    for key in langs.iterkeys():
+        if key not in result:
+            result.append(key)
+    return result
+
+# Part of library dealing with interwiki links
+
+def getLanguageLinks(text):
+    """Returns a dictionary of other language links mentioned in the text
+       in the form {code:pagename}"""
+    result = {}
+    for code in langs:
+        m=re.search(r'\[\['+code+':([^\]]*)\]\]', text)
+        if m:
+            result[code] = m.group(1)
+    return result
+
+def removeLanguageLinks(text):
+    for code in langs:
+        text=re.sub(r'\[\['+code+':([^\]]*)\]\]', '', text)
+    m=re.search(r'\[\[([a-z][a-z]):([^\]]*)\]\]', text)
+    if m:
+        print "WARNING: Link to unknown language %s name %s"%(m.group(1), m.group(2))
+    return text
+
+def interwikiFormat(links):
+    s=''
+    ar=links.keys()
+    ar.sort()
+    for code in ar:
+        s = s + '[[%s:%s]]'%(code, links[code])
+    return s
+
| 162 | + |
Property changes on: branches/init1/pywikipedia/wikipedia.py |
___________________________________________________________________ |
Added: svn:keywords |
+ Author Date Id Revision
Added: svn:eol-style |
+ native
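Taken together, the library supports a simple read-modify-write cycle. A minimal usage sketch (the page title and the added 'fr' link are hypothetical, and the 2003 hostnames above are long gone, so this is illustrative rather than runnable today):

    import wikipedia

    wikipedia.setAction('demo: updating interwiki links')
    text = wikipedia.getPage('nl', '900')          # raises wikipedia.NoPage if absent
    links = wikipedia.getLanguageLinks(text)       # e.g. {'en': '900', 'de': '900'}
    links['fr'] = '900'                            # add one link by hand
    body = wikipedia.removeLanguageLinks(text)     # strip the old [[xx:...]] links
    newtext = wikipedia.interwikiFormat(links) + body
    status, reason, data = wikipedia.putPage('nl', '900', newtext)
    print status, reason                           # a successful save returns a 302 redirect

Note that getPage() records the page's wpEdittime token in wikipedia.edittime, which putPage() requires, so a page must always be fetched before it is saved.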
Property changes on: branches/init1/pywikipedia |
___________________________________________________________________ |
Added: svn:ignore |
+ *.pyc