r2 pywikipedia - Code Review archive

Repository: pywikipedia
Revision: r1 | r2 | r3 >
Date: 20:22, 14 June 2003
Author: hooft
Status: old
Tags:
Comment:
Initial revision
Modified paths:
  • /trunk/pywikipedia (added) (history)
  • /trunk/pywikipedia/.cvsignore (added) (history)
  • /trunk/pywikipedia/getlang.py (added) (history)
  • /trunk/pywikipedia/test.py (added) (history)
  • /trunk/pywikipedia/wikipedia.py (added) (history)

Diff [purge]

Index: trunk/pywikipedia/test.py
@@ -0,0 +1,15 @@
 2+#
 3+# Script to check language links for years A.D.
 4+#
 5+# (C) Rob W.W. Hooft, 2003
 6+# Distribute under the terms of the GPL.
 7+
 8+import sys,wikipedia
 9+
 10+wikipedia.langs={'test':'test.wikipedia.org'}
 11+
 12+text=wikipedia.getPage('test','Robottest')
 13+text=text+'\nrobot was here\n'
 14+status,reason,data=wikipedia.putPage('test','Robottest',text)
 15+print status,reason
 16+
Property changes on: trunk/pywikipedia/test.py
___________________________________________________________________
Added: svn:keywords
117 + Author Date Id Revision
Added: svn:eol-style
218 + native
Index: trunk/pywikipedia/getlang.py
@@ -0,0 +1,119 @@
 2+#
 3+# Script to check language links for years A.D.
 4+#
 5+# (C) Rob W.W. Hooft, 2003
 6+# Distribute under the terms of the GPL.
 7+
 8+import sys,copy,wikipedia
 9+
 10+# languages to check for missing links and modify
 11+mylangs = ('nl', )
 12+
 13+# selection of years to check
 14+years = range(900,999+1)
 15+#years=[879]
 16+
 17+# Set to 1 to actually change the pages
 18+forreal = 1
 19+
 20+# Summary used in the modification request
 21+wikipedia.setAction('Rob Hooft: automatic interwiki script for years')
 22+
 23+debug = 0
 24+
 25+def findlangs(year):
 26+ matrix={}
 27+ text={}
 28+ print year,"=",
 29+ missing=0
 30+ for code in wikipedia.languages(mylangs):
 31+ print code+":", ; sys.stdout.flush()
 32+ try:
 33+ t=wikipedia.getPage(code,year)
 34+ except wikipedia.NoPage:
 35+ if code in mylangs:
 36+ missing+=1
 37+ if missing==len(mylangs):
 38+ # None of the mylangs has this page. Doesn't make sense.
 39+ print
 40+ return None,None
 41+ pass
 42+ else:
 43+ text[code]=t
 44+ l=wikipedia.getLanguageLinks(t)
 45+ l[code]=year # Add self-reference
 46+ matrix[code]=l
 47+ print
 48+ return text,matrix
 49+
 50+def assemblelangmatrix(m):
 51+ result={}
 52+ for dum,line in m.iteritems():
 53+ for code,name in line.iteritems():
 54+ if not code in m:
 55+ pass
 56+ #print "WARNING: Ignore %s from %s; did not see actual page there"%(code,dum)
 57+ elif code in result:
 58+ if result[code]!=name:
 59+ print "WARNING: Name %s is either %s or %s"%(code,result[code],name)
 60+ else:
 61+ result[code]=name
 62+ return result
 63+
 64+def missingLanguages(m,line,thiscode):
 65+ # Figure out whether any references in the assembled references mentioned in
 66+ # line are missing from the language page referred by thiscode.
 67+ result={}
 68+ for code,name in line.iteritems():
 69+ if code==thiscode:
 70+ pass
 71+ elif code in m[thiscode]:
 72+ pass
 73+ else:
 74+ result[code]=name
 75+ for code,name in m[thiscode].iteritems():
 76+ if not code in line:
 77+ print "WARNING: %s contains reference to unknown %s:%s"%(thiscode,code,name)
 78+ elif line[code]!=name:
 79+ print "WARNING: %s reference to %s is %s and not %s"%(thiscode,code,name,line[code])
 80+ return result
 81+
 82+def compareLanguages(old,new):
 83+ removing=[]
 84+ adding=[]
 85+ for code,name in old.iteritems():
 86+ if not new.has_key(code):
 87+ removing.append(code)
 88+ for code,name in new.iteritems():
 89+ if not old.has_key(code):
 90+ adding.append(code)
 91+ s=""
 92+ if adding:
 93+ s=s+" Adding:"+",".join(adding)
 94+ if removing:
 95+ s=s+" Removing:"+",".join(removing)
 96+ return s
 97+
 98+for year in years:
 99+ text,m=findlangs(str(year))
 100+ if m is None:
 101+ # None of the mylangs has this page
 102+ continue
 103+ proper=assemblelangmatrix(m)
 104+ for mycode in mylangs:
 105+ if mycode in m: # Page must be present in this language
 106+ ml=copy.copy(proper)
 107+ status=compareLanguages(m[mycode],ml)
 108+ if status:
 109+ print mycode,str(year),":",status
 110+ del ml[mycode]
 111+ s=wikipedia.interwikiFormat(ml)
 112+ newtext=s+wikipedia.removeLanguageLinks(text[mycode])
 113+ if debug:
 114+ print newtext
 115+ if newtext!=text[mycode]:
 116+ print "NOTE: Replacing %s: %s"%(mycode,s)
 117+ if forreal:
 118+ status,reason,data=wikipedia.putPage(mycode,str(year),newtext)
 119+ if str(status)!='302':
 120+ print status,reason
Property changes on: trunk/pywikipedia/getlang.py
___________________________________________________________________
Added: svn:keywords
1121 + Author Date Id Revision
Added: svn:eol-style
2122 + native
Index: trunk/pywikipedia/.cvsignore
@@ -0,0 +1 @@
 2+*.pyc
Property changes on: trunk/pywikipedia/.cvsignore
___________________________________________________________________
Added: svn:eol-style
13 + native
Index: trunk/pywikipedia/wikipedia.py
@@ -0,0 +1,161 @@
 2+# Library to get and put pages on Wikipedia
 3+import urllib,re
 4+
 5+# known wikipedia languages
 6+langs = {'en':'www.wikipedia.org',
 7+ 'pl':'pl.wikipedia.org',
 8+ 'da':'da.wikipedia.org',
 9+ 'sv':'sv.wikipedia.org',
 10+ 'zh':'zh.wikipedia.org',
 11+ 'eo':'eo.wikipedia.org',
 12+ 'nl':'nl.wikipedia.org',
 13+ 'de':'de.wikipedia.org',
 14+ 'fr':'fr.wikipedia.org',
 15+ 'es':'es.wikipedia.org',
 16+ 'it':'it.wikipedia.com',
 17+ 'no':'no.wikipedia.com',
 18+ 'pt':'pt.wikipedia.com',
 19+ 'af':'af.wikipedia.com',
 20+ 'fy':'fy.wikipedia.com',
 21+ 'la':'la.wikipedia.com',
 22+ 'ca':'ca.wikipedia.com',
 23+ 'fi':'fi.wikipedia.com',
 24+ 'ia':'ia.wikipedia.com',
 25+ 'et':'et.wikipedia.com',
 26+ 'cs':'cs.wikipedia.org',
 27+ }
 28+
 29+action = 'Rob Hooft - Wikipedia python library'
 30+
 31+debug = 0
 32+
 33+# Keep the modification time of all downloaded pages for an eventual put.
 34+edittime = {}
 35+
 36+# Local exceptions
 37+
 38+class Error(Exception):
 39+ """Wikipedia error"""
 40+
 41+class NoPage(Error):
 42+ """Wikipedia page does not exist"""
 43+
 44+# Library functions
 45+def unescape(s):
 46+ if '&' not in s:
 47+ return s
 48+ s = s.replace("&lt;", "<")
 49+ s = s.replace("&gt;", ">")
 50+ s = s.replace("&apos;", "'")
 51+ s = s.replace("&quot;", '"')
 52+ s = s.replace("&amp;", "&") # Must be last
 53+ return s
 54+
 55+def setAction(s):
 56+ """Set a summary to use for changed page submissions"""
 57+ global action
 58+ action = s
 59+
 60+def urlencode(query):
 61+ l=[]
 62+ for k, v in query:
 63+ k = urllib.quote(str(k))
 64+ v = urllib.quote(str(v))
 65+ l.append(k + '=' + v)
 66+ return '&'.join(l)
 67+
 68+def putPage(code, name, text):
 69+ """Upload 'text' on page 'name' to the 'code' language wikipedia."""
 70+ import httplib
 71+ host = langs[code]
 72+ if host[-4:] == '.com':
 73+ raise Error("Cannot put pages on a .com wikipedia")
 74+ address = '/w/wiki.phtml?title=%s&action=submit'%(name)
 75+ data = urlencode((
 76+ ('wpSummary', action),
 77+ ('wpMinoredit', '1'),
 78+ ('wpSave', '1'),
 79+ ('wpEdittime', edittime[code,name]),
 80+ ('wpTextbox1', text)))
 81+ if debug:
 82+ print text
 83+ print address
 84+ print data
 85+ #return None, None, None
 86+ headers = {"Content-type": "application/x-www-form-urlencoded"}
 87+ conn = httplib.HTTPConnection(host)
 88+ conn.request("POST", address, data, headers)
 89+ response = conn.getresponse()
 90+ data = response.read()
 91+ conn.close()
 92+ return response.status, response.reason, data
 93+
 94+def getPage(code, name):
 95+ """Get the contents of page 'name' from the 'code' language wikipedia"""
 96+ host = langs[code]
 97+ name = re.sub(' ', '_', name)
 98+ name = urllib.quote(name)
 99+ if host[-4:] == '.org': # New software
 100+ url = 'http://'+host+'/w/wiki.phtml?title='+name+'&action=edit'
 101+ elif host[-4:]=='.com': # Old software
 102+ url = 'http://'+host+'/wiki.cgi?action=edit&id='+name
 103+ if debug:
 104+ print url
 105+ f = urllib.urlopen(url)
 106+ text = f.read()
 107+ f.close()
 108+ m = re.search('value="(\d+)" name=\'wpEdittime\'',text)
 109+ if m:
 110+ edittime[code,name]=m.group(1)
 111+ else:
 112+ m = re.search('value="(\d+)" name="wpEdittime"',text)
 113+ if m:
 114+ edittime[code,name]=m.group(1)
 115+ i1 = re.search('<textarea[^>]*>',text).end()
 116+ i2 = re.search('</textarea>',text).start()
 117+ if i2-i1 < 2:
 118+ raise NoPage()
 119+ elif text[i1:i2] == 'Describe the new page here.\n':
 120+ raise NoPage()
 121+ else:
 122+ return unescape(text[i1:i2])
 123+
 124+def languages(first):
 125+ """Return a list of language codes for known wikipedia servers"""
 126+ result=[]
 127+ for key in first:
 128+ if key in langs.iterkeys():
 129+ result.append(key)
 130+ for key in langs.iterkeys():
 131+ if key not in result:
 132+ result.append(key)
 133+ return result
 134+
 135+# Part of library dealing with interwiki links
 136+
 137+def getLanguageLinks(text):
 138+ """Returns a dictionary of other language links mentioned in the text
 139+ in the form {code:pagename}"""
 140+ result = {}
 141+ for code in langs:
 142+ m=re.search(r'\[\['+code+':([^\]]*)\]\]', text)
 143+ if m:
 144+ result[code] = m.group(1)
 145+ return result
 146+
 147+def removeLanguageLinks(text):
 148+ for code in langs:
 149+ text=re.sub(r'\[\['+code+':([^\]]*)\]\]', '', text)
 150+ m=re.search(r'\[\[([a-z][a-z]):([^\]]*)\]\]', text)
 151+ if m:
 152+ print "WARNING: Link to unknown language %s name %s"%(m.group(1), m.group(2))
 153+ return text
 154+
 155+def interwikiFormat(links):
 156+ s=''
 157+ ar=links.keys()
 158+ ar.sort()
 159+ for code in ar:
 160+ s = s + '[[%s:%s]]'%(code, links[code])
 161+ return s
 162+
Property changes on: trunk/pywikipedia/wikipedia.py
___________________________________________________________________
Added: svn:keywords
1163 + Author Date Id Revision
Added: svn:eol-style
2164 + native
Property changes on: trunk/pywikipedia
___________________________________________________________________
Added: svn:ignore
3165 + *.pyc

Status & tagging log