# Source page header (pasted from the Korean Wikipedia user page this script
# was copied from; commented out so the file parses as Python):
# 사용자:풀빵/스크립트작업실/showko.py  ("User:Pulppang/Script workshop/showko.py")
# 위키백과, 우리 모두의 백과사전.  ("Wikipedia, the free encyclopedia.")
#!/usr/bin/python
# -*- coding: utf-8  -*-

"""
This bot goes over multiple pages of the home wiki, searches for links, and
show ko-language interwiki(s).

These command line parameters can be used to specify which pages to work on:

&params;

    -xml           Retrieve information from a local XML dump (pages-articles
                   or pages-meta-current, see http://download.wikimedia.org).
                   Argument can also be given as "-xml:filename".

    -namespace:n   Number of namespace to process. The parameter can be used
                   multiple times. It works in combination with all other
                   parameters, except for the -start parameter. If you e.g.
                   want to iterate over all categories starting at M, use
                   -start:Category:M.

All other parameters will be regarded as part of the title of a single page,
and the bot will only work on that single page.
"""

__version__='$Id: showko.py 4251 2007-09-12 10:36:03Z ... $'

import re, sys, codecs
import date
import pprint

import wikipedia, pagegenerators, catlib




def isdate(s):
    """Return True iff s is a date or year in the home wiki's language.

    Relies on date.getAutoFormat(), which returns a (dictName, value) pair
    where dictName is None when s is not recognized as a date/year.
    """
    # Renamed the first tuple element: the original bound it to 'dict',
    # shadowing the builtin. The value element is unused.
    dictName, _value = date.getAutoFormat(wikipedia.getSite().language(), s)
    return dictName is not None

class ShowKoBot:
    """Scan pages for wikilinks, follow each linked page, and append the
    Korean (ko:) interwiki target, if any, next to the original link.

    Results are written to per-page text files under the w/ directory;
    the wiki itself is never modified.
    """

    # Matches a bare year link such as [[1999]] (no section, no label).
    _yearR = re.compile(r'^\[\[[0-9]{1,4}\]\]$')

    def __init__(self, generator):
        """
        Arguments:
            generator - an iterable yielding wikipedia.Page objects to treat.
        """
        self.generator = generator
        linktrail = wikipedia.getSite().linktrail()
        # The regular expression which finds links. Results consist of four
        # groups:
        # - title:     the target page title (everything before | or ]).
        # - section:   the page section; includes the leading '#'.
        # - label:     the alternative link title (between | and ]).
        # - linktrail: letters after ]] which are part of the word; note
        #              that the definition of 'letter' varies per language.
        self.linkR = re.compile(
            r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?'
            r'(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')
        # Maps each link target already looked up to the ko-link text found
        # for it (empty string while/if none was found).
        self.dict_visited_links = {}

    def isYear(self, text):
        """Return True iff text is a bare year link such as [[1999]]."""
        return self._yearR.match(text) is not None

    def isMonth(self, text):
        """Placeholder: month links are not yet recognized."""
        return False

    def isMonthDay(self, text):
        """Placeholder: month-day links are not yet recognized."""
        return False

    def convertEnglishDateToKoreanDate(self, page, linktextlist):
        """Convert a list of English date link texts to a Korean date string,
        e.g. u'[[2008년]] [[5월 3일]]' (reconstructed from a mojibake comment
        in the original source -- TODO confirm the intended format).

        Returns the converted string, or None on failure.
        NOTE(review): unimplemented -- currently always returns None.
        """
        # Fixed: the original referenced an undefined name 'textlist' and
        # called a nonexistent .length() method.
        iLen = len(linktextlist)
        iYear = -1      # index of the year component, once found
        iMonth = -1
        iMonthDay = -1
        return None

    def handleNextLinkInLinkedPage(self, page, text, match, context=100):
        """If the matched link is a ko: interwiki link, return the tuple
        (newlink, jumpToBeginning) where newlink is u'[[<korean title>]]';
        otherwise return (None, False).
        """
        title = match.group('title')
        if not page.site().isInterwikiLink(title):
            return None, False
        if title.startswith('ko:'):
            wikipedia.output('title = ' + title[len(u'ko:'):])
            return u'[[' + title[len(u'ko:'):] + u']]', False
        return None, False

    def handleNextLink(self, page, text, match, context=100):
        """Process one link match found in page's text.

        Follows the linked page and, if it carries a ko: interwiki link,
        inserts u' [[<korean title>]]' right after the matched link.

        Returns a tuple (text, skipCount, jumpToBeginning):
            text            - the unicode text after this link was processed.
            skipCount       - length of the inserted ko link (0 if nothing
                              was inserted); the caller uses it to advance
                              the cursor past the insertion.
            jumpToBeginning - True if the cursor should be reset to 0;
                              always False in this implementation.
        """
        # Ignore interwiki links, section links, images, and explicit
        # ko: links.
        if (not match.group('title')
                or page.site().isInterwikiLink(match.group('title'))
                or match.group('section')):
            return text, 0, False
        if match.group('title').startswith("Image:"):
            return text, 0, False
        if match.group('title').startswith("ko:"):
            return text, 0, False

        target_title = match.group('title')
        # Look each target up only once per bot run.
        if target_title in self.dict_visited_links:
            wikipedia.output(u"_Page %s - already visited" % (target_title))
            return text, 0, False
        else:
            self.dict_visited_links[target_title] = u""

        linkedPage = wikipedia.Page(page.site(), target_title)

        if linkedPage.isRedirectPage():
            try:
                linkedPage = linkedPage.getRedirectTarget()
            except wikipedia.NoPage:
                wikipedia.output(u"_Page %s does not exist?!" % linkedPage.aslink())
                return text, 0, False
            except wikipedia.LockedPage:
                wikipedia.output(u"_Page %s is locked?!" % linkedPage.aslink())
                return text, 0, False

        try:
            oldLinkedPageText = linkedPage.get(get_redirect=True)

            wikipedia.output(u"\n\n>>> \03{lightaqua}(%s)(%s)\03{default} <<<" % (target_title, linkedPage.title()))
            linkedPageText = oldLinkedPageText
            DictLanguageLinks = wikipedia.getLanguageLinks(linkedPageText)

            target_lang_site = wikipedia.Site(fam='wikipedia', code='ko')

            if target_lang_site in DictLanguageLinks:
                target_lang_page = DictLanguageLinks[target_lang_site]
                newLink = u'[[' + target_lang_page.title() + u']]'
                self.dict_visited_links[target_title] = newLink
                # Insert the ko link right after the matched link.
                # (text[:match.start()] + text[match.start():match.end()]
                # in the original is just text[:match.end()].)
                return (text[:match.end()] + u' ' + newLink + text[match.end():],
                        len(newLink), False)
            else:
                return text, 0, False

        except wikipedia.NoPage:
            wikipedia.output(u"_Page %s does not exist?!" % linkedPage.aslink())
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"_Page %s is a redirect; skipping." % linkedPage.aslink())
        except wikipedia.LockedPage:
            wikipedia.output(u"_Page %s is locked?!" % linkedPage.aslink())

        return text, 0, False

    def _dumpPage(self, page, text, separator=u""):
        """Write the visited-links table, separator, and text to
        w/<page title>.txt as UTF-8."""
        outputFilename = u"w/" + page.title() + u".txt"
        outputFile = codecs.open(outputFilename, 'w', 'utf-8')
        try:
            visited_links_list_text = ""
            for k, v in self.dict_visited_links.items():
                visited_links_list_text += '%s    %s\n' % (k, v)
            outputFile.write(visited_links_list_text + separator + text)
        finally:
            outputFile.close()

    def treat(self, page):
        """Process one page: annotate its links with ko interwikis and dump
        the result to disk. Never saves back to the wiki."""
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
        # Initialized up front so the KeyboardInterrupt handler below cannot
        # hit a NameError if the interrupt arrives before the page loads.
        text = u""
        try:
            oldText = page.get(get_redirect=True)
            # Inside image maps, don't touch links, as they're used
            # to create tooltip labels. See for example:
            # http://de.wikipedia.org/w/index.php?title=Innenstadt_%28Bautzen%29&diff=next&oldid=35721641
            if '<imagemap>' in oldText:
                wikipedia.output(u'Skipping page %s because it contains an image map.' % page.aslink())
                return
            text = oldText
            curpos = 0
            while curpos < len(text):
                match = self.linkR.search(text, pos=curpos)
                if not match:
                    break
                # Make sure that next time around we will not find this
                # same hit.
                curpos = match.start() + 1
                text, skipCount, jumpToBeginning = self.handleNextLink(page, text, match)
                if jumpToBeginning:
                    curpos = 0
                elif skipCount > 0:
                    # Skip past the whole original match; the inserted ko
                    # link follows it.
                    curpos = curpos + match.end() - match.start() + 1

            # Write to disk rather than saving to the wiki.
            self._dumpPage(page, text)
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist?!" % page.aslink())
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked?!" % page.aslink())
        except KeyboardInterrupt:
            # NOTE(review): the interrupt is deliberately swallowed here so
            # partial results are dumped; run() then continues with the next
            # page. Re-raise if a Ctrl-C should actually stop the bot.
            self._dumpPage(page, text, separator=u"\n\n==CLIP==\n\n")

    def run(self):
        """Treat every page produced by the generator, then shut down."""
        for page in self.generator:
            self.treat(page)
        wikipedia.stopme()

def main():
    """Parse command-line arguments, build a page generator, run the bot.

    Supported arguments (everything else is consumed by
    wikipedia.handleArgs()):
        -xml[:filename]   iterate pages from a local XML dump.
        -page[:title]     work on a single page.
    Shows the help text when no generator could be built.
    """
    gen = None
    pageTitle = ""

    for arg in wikipedia.handleArgs():
        if arg.startswith('-xml'):
            if len(arg) == len('-xml'):
                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
            else:
                # Strip the '-xml:' prefix (the original used a magic [5:]).
                xmlFilename = arg[len('-xml:'):]
            # NOTE(review): XmlDumpSelflinkPageGenerator is neither defined
            # nor imported anywhere in this file, so the -xml option raises
            # NameError. Presumably an XML generator from pagegenerators was
            # intended -- confirm before enabling this option.
            gen = XmlDumpSelflinkPageGenerator(xmlFilename)
        elif arg.startswith('-page'):
            if len(arg) == len('-page'):
                pageTitle = wikipedia.input(u'Please enter the title of the page:')
            else:
                pageTitle = arg[len('-page:'):]

    if pageTitle:
        page = wikipedia.Page(wikipedia.getSite(), pageTitle)
        gen = iter([page])
    if not gen:
        wikipedia.showHelp('showko')
    else:
        bot = ShowKoBot(gen)
        bot.run()

# Script entry point: always release the framework, even when main() raises.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()