#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot goes over multiple pages of the home wiki, searches for links, and
for each linked page shows its Korean (ko:) interwiki link. The annotated
text is written to a local file instead of being saved back to the wiki.
These command line parameters can be used to specify which pages to work on:
¶ms;
-xml Retrieve information from a local XML dump (pages-articles
or pages-meta-current, see http://download.wikimedia.org).
Argument can also be given as "-xml:filename".
-namespace:n Number of namespace to process. The parameter can be used
multiple times. It works in combination with all other
parameters, except for the -start parameter. If you e.g.
want to iterate over all categories starting at M, use
-start:Category:M.
-page:Title       Work on the single page with the given title. If the title
                  is omitted ("-page" alone), you will be asked for it.
"""
__version__='$Id: showko.py 4251 2007-09-12 10:36:03Z ... $'
import re, sys, codecs
import date
import pprint
import wikipedia, pagegenerators, catlib
def isdate(s):
    """Return True iff s is a date or a year."""
    dictName, value = date.getAutoFormat(wikipedia.getSite().language(), s)
    return dictName is not None
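    # For example, on an English-language home wiki one would expect
    # isdate(u'1982') and isdate(u'May 3') to return True and isdate(u'Seoul')
    # to return False (illustrative values; the recognised formats depend on
    # what date.getAutoFormat accepts for the wiki's language).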
class ShowKoBot:
def __init__(self, generator):
self.generator = generator
linktrail = wikipedia.getSite().linktrail()
# The regular expression which finds links. Results consist of four groups:
# group title is the target page title, that is, everything before | or ].
# group section is the page section. It'll include the # to make life easier for us.
# group label is the alternative link title, that's everything between | and ].
# group linktrail is the link trail, that's letters after ]] which are part of the word.
# note that the definition of 'letter' varies from language to language.
self.linkR = re.compile(r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')
#self.linkInterwikiR = re.compile(r'\[\[(?P<title>ko:[^\]\|#]*)(?P<section>#[^\]\|]*)?(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')
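        # Illustration (hypothetical input; assumes a Latin-alphabet link
        # trail such as [a-z]*): matching u'[[Seoul|the capital]]s' yields
        # title=u'Seoul', section=None, label=u'the capital', linktrail=u's'.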
self.dict_visited_links = {}
    def isYear(self, text):
        # Matches a bare year link such as [[1982]]: no section, no label.
        reobject1 = re.compile(r'^\[\[[0-9]{1,4}\]\]$')
        return reobject1.match(text) is not None
    def isMonth(self, text):
        # Placeholder: month detection is not implemented yet.
        return False
    def isMonthDay(self, text):
        # Placeholder: month-day detection is not implemented yet.
        return False
    def convertEnglishDateToKoreanDate(self, page, linktextlist):
        # Return a string such as u'[[2008년]] [[5월 3일]]', or None on failure.
        # Not implemented yet: the intention is to locate the year, month and
        # month-day links in linktextlist and reassemble them in Korean form.
        iLen = len(linktextlist)
        iYear = -1      # index of the year link
        iMonth = -1     # index of the month link
        iMonthDay = -1  # index of the month-day link
        return None
def handleNextLinkInLinkedPage(self, page, text, match, context =100):
title = match.group('title')
if not page.site().isInterwikiLink(title):
return None, False
if title.startswith('ko:'):
wikipedia.output('title = ' + title[len(u'ko:'):])
return u'[[' + title[len(u'ko:'):] + u']]', False
return None, False
def handleNextLink(self, page, text, match, context = 100):
"""
Returns a tuple (text, jumpToBeginning).
text is the unicode string after the current link has been processed.
jumpToBeginning is a boolean which specifies if the cursor position
should be reset to 0. This is required after the user has edited the
article.
"""
        # Ignore interwiki links, section-only links such as [[#foo]], and
        # any link that contains a section part.
if not match.group('title') or page.site().isInterwikiLink(match.group('title')) or match.group('section'):
return text, 0, False
if match.group('title').startswith("Image:"):
return text, 0, False
if match.group('title').startswith("ko:"):
return text, 0, False
# consider returning just [[title#section|label]]...
target_title = match.group('title')
if self.dict_visited_links.has_key(target_title):
wikipedia.output(u"_Page %s - already visited" % (target_title))
return text, 0, False
else:
self.dict_visited_links[target_title] = u""
linkedPage = wikipedia.Page(page.site(), target_title)
if linkedPage.isRedirectPage():
try:
linkedPage = linkedPage.getRedirectTarget()
except wikipedia.NoPage:
wikipedia.output(u"_Page %s does not exist?!" % linkedPage.aslink())
return text, 0, False
except wikipedia.LockedPage:
wikipedia.output(u"_Page %s is locked?!" % linkedPage.aslink())
return text, 0, False
try:
oldLinkedPageText = linkedPage.get(get_redirect=True)
wikipedia.output(u"\n\n>>> \03{lightaqua}(%s)(%s)\03{default} <<<" % (target_title, linkedPage.title()))
linkedPageText = oldLinkedPageText
DictLanguageLinks = wikipedia.getLanguageLinks(linkedPageText)
#pprint.pprint(DictLanguageLinks)
target_lang_site = wikipedia.Site(fam='wikipedia', code='ko')
if DictLanguageLinks.has_key(target_lang_site):
target_lang_page = DictLanguageLinks[target_lang_site]
#print target_lang_page.title()
newLink = u'[[' + target_lang_page.title() + u']]'
self.dict_visited_links[target_title] = newLink
                    # Insert the Korean link right after the original link.
                    return text[:match.end()] + u' ' + newLink + text[match.end():], len(newLink), False
else:
#print 'no target lang page'
return text, 0, False
except wikipedia.NoPage:
wikipedia.output(u"_Page %s does not exist?!" % linkedPage.aslink())
except wikipedia.IsRedirectPage:
wikipedia.output(u"_Page %s is a redirect; skipping." % linkedPage.aslink())
except wikipedia.LockedPage:
wikipedia.output(u"_Page %s is locked?!" % linkedPage.aslink())
return text, 0, False
def treat(self, page):
# Show the title of the page we're working on.
# Highlight the title in purple.
wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
try:
oldText = page.get(get_redirect=True)
            # Don't touch pages containing image maps: links inside
            # <imagemap> are used to create tooltip labels. See for example:
            # http://de.wikipedia.org/w/index.php?title=Innenstadt_%28Bautzen%29&diff=next&oldid=35721641
if '<imagemap>' in oldText:
wikipedia.output(u'Skipping page %s because it contains an image map.' % page.aslink())
return
text = oldText
curpos = 0
while curpos < len(text):
match = self.linkR.search(text, pos = curpos)
if not match:
break
# Make sure that next time around we will not find this same hit.
curpos = match.start() + 1
text, skipCount, jumpToBeginning = self.handleNextLink(page, text, match)
if jumpToBeginning:
curpos = 0
elif skipCount > 0:
#print text[curpos:curpos+30]
curpos = curpos + match.end() - match.start() + 1
#print text[curpos:curpos+30]
# write to disk!
# wikipedia.output(text, toStdout = True)
outputFilename = u"w/" + page.title() + u".txt"
outputFile = codecs.open(outputFilename, 'w', 'utf-8')
            # Prepend the list of visited links (and their ko: targets, if
            # any) to the page text before writing it out.
            visited_links_list_text = u""
            for k, v in self.dict_visited_links.items():
                visited_links_list_text += u'%s %s\n' % (k, v)
            text = visited_links_list_text + text
outputFile.write(text)
outputFile.close()
except wikipedia.NoPage:
wikipedia.output(u"Page %s does not exist?!" % page.aslink())
except wikipedia.IsRedirectPage:
wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
except wikipedia.LockedPage:
wikipedia.output(u"Page %s is locked?!" % page.aslink())
except KeyboardInterrupt:
outputFilename = u"w/" + page.title() + u".txt"
outputFile = codecs.open(outputFilename, 'w', 'utf-8')
            # Dump whatever has been collected so far for this page.
            visited_links_list_text = u""
            for k, v in self.dict_visited_links.items():
                visited_links_list_text += u'%s %s\n' % (k, v)
            text = visited_links_list_text + u"\n\n==CLIP==\n\n" + text
outputFile.write(text)
outputFile.close()
def run(self):
for page in self.generator:
self.treat(page)
wikipedia.stopme()
def main():
#page generator
gen = None
pageTitle = ""
for arg in wikipedia.handleArgs():
if arg.startswith('-xml'):
if len(arg) == 4:
xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
else:
xmlFilename = arg[5:]
            # Note: XmlDumpSelflinkPageGenerator is not defined in this file;
            # it has to be imported or copied from selflink.py, which defines
            # a generator of that name in the pywikipedia framework.
            gen = XmlDumpSelflinkPageGenerator(xmlFilename)
elif arg.startswith('-page'):
if len(arg) == len('-page'):
pageTitle = wikipedia.input(u'Please enter the title of the page:')
else:
pageTitle = arg[len('-page'):]
if pageTitle:
page = wikipedia.Page(wikipedia.getSite(), pageTitle)
gen = iter([page])
if not gen:
wikipedia.showHelp('showko')
else:
bot = ShowKoBot(gen)
bot.run()
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()