사용자:풀빵/스크립트작업실/replace-project.py

위키백과, 우리 모두의 백과사전.
# -*- coding: utf-8 -*-
"""
replace project
"""
#
# (C) Daniel Herding & the Pywikipediabot Team, 2004-2008
#
# Distributed under the terms of the MIT license.
#

# To keep a new talk page from being created automatically when the talk
# page does not exist yet, ask before creating it:
#
# choice = wikipedia.inputChoice(
#                u'Do you want to create a new one?',
#                ['Yes', 'No'], ['y', 'N'], 'N')
# if choice in ['n', 'N']:
#     continue
#
# Doing it this way is enough.

from __future__ import generators
import sys, re, time
import wikipedia, pagegenerators, catlib, config

# Imports predefined replacements tasks from fixes.py
import fixes

# Needed for the text that is displayed when this script is started with
# the -help parameter (presumably each key is a placeholder that gets
# replaced by its value -- confirm against wikipedia.showHelp).
docuReplacements = dict([
    ('&params;', pagegenerators.parameterHelp),
    ('&fixes-help;', fixes.help),
])


# must be set!
g_removal_tmpl = False

#g_prj_tmpl = u'{{화학 프로젝트}}\r\n'
#g_prj_regex = u'\{\{화학 프로젝트\}\}'

#g_prj_tmpl = u'{{화폐 프로젝트}}\r\n'
#g_prj_regex = u'{{화폐 프로젝트}}'

g_prj_tmpl = u'{{화학 프로젝트}}\r\n'
g_prj_regex = u'\{\{화학 프로젝트\}\}'

__version__='$Id: replace.py 5269 2008-04-24 15:19:15Z huji $'

# Summary messages in different languages.
# Every message must contain exactly one '%s': main() fills it with the
# description of the replacements via 'msg % description'. A message without
# the placeholder makes that formatting raise TypeError.
# NOTE: Predefined replacement tasks might use their own dictionary, see
# 'fixes' below.
msg = {
       'ar':u'%s روبوت : استبدال تلقائي للنص',
       'de':u'Bot: Automatisierte Textersetzung %s',
       'el':u'Ρομπότ: Αυτόματη αντικατάσταση κειμένου %s',
       'en':u'Robot: Automated text replacement %s',
       'es':u'Robot: Reemplazo automático de texto %s',
       'fa':u'ربات: تغییر خودکار متن %s',
       'fr':u'Bot : Remplacement de texte automatisé %s',
       'he':u'בוט: החלפת טקסט אוטומטית %s',
       'hu':u'Robot: Automatikus szövegcsere %s',
       'ia':u'Robot: Reimplaciamento automatic de texto %s',
       'id':u'Bot: Penggantian teks otomatis %s',
       'is':u'Vélmenni: breyti texta %s',
       'it':u'Bot: Sostituzione automatica %s',
       'ja':u'ロボットによる: 文字置き換え %s',
       'ka':u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
       'kk':u'Бот: Мәтінді өздікті алмастырды: %s',
       'ksh':u'Bot: hät outomatesch Täx jetuusch: %s',
       'lt':u'robotas: Automatinis teksto keitimas %s',
       'nds':u'Bot: Text automaatsch utwesselt: %s',
       'nds-nl':u'Bot: autematisch tekse vervungen %s',
       'nl':u'Bot: automatisch tekst vervangen %s',
       'nn':u'robot: automatisk teksterstatting: %s',
       'no':u'bot: Automatisk teksterstatning: %s',
       'pl':u'Robot automatycznie zamienia tekst %s',
       'pt':u'Bot: Mudança automática %s',
       # The placeholder was missing here, which broke summary formatting.
       'ru':u'Робот: Автоматизированная замена текста %s',
       'sr':u'Бот: Аутоматска замена текста %s',
       'sv':u'Bot: Automatisk textersättning: %s',
       'zh': u'機器人:執行文字代換作業 %s',
       }


class XmlDumpReplacePageGenerator:
    """
    Iterable over the Pages of a local XML dump that the replacements
    would modify.

    Each dump entry is checked offline: entries whose title or text hits an
    exception are dropped, the replacements are applied to the remaining
    ones, and a Page is produced only when the text actually changed.

    Arguments:
        * xmlFilename  - Path of the dump file, absolute or relative.
        * xmlStart     - Title of the first dump entry to look at; every
                         entry before it is ignored.
        * replacements - A list of 2-tuples: the text to look for (as a
                         compiled regular expression) and the text to put
                         in its place (as a string).
        * exceptions   - A dictionary describing when an occurrence must be
                         left alone. See the documentation of the
                         ReplaceRobot constructor.

    """
    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.xmlStart = xmlStart
        # While this is set, entries are dropped until xmlStart shows up.
        self.skipping = bool(xmlStart)
        self.replacements = replacements
        self.exceptions = exceptions

        # Both kinds of "inside" exceptions end up in one list, which is
        # what wikipedia.replaceExcept() receives.
        self.excsInside = []
        for key in ('inside-tags', 'inside'):
            if key in self.exceptions:
                self.excsInside += self.exceptions[key]

        import xmlreader
        self.site = wikipedia.getSite()
        self.parser = xmlreader.XmlDump(self.xmlFilename).parse()

    def __iter__(self):
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title == self.xmlStart:
                        self.skipping = False
                    else:
                        continue
                if self.isTitleExcepted(entry.title):
                    continue
                if self.isTextExcepted(entry.text):
                    continue
                changed_text = entry.text
                for pattern, substitute in self.replacements:
                    changed_text = wikipedia.replaceExcept(
                        changed_text, pattern, substitute, self.excsInside)
                if changed_text != entry.text:
                    yield wikipedia.Page(self.site, entry.title)
        except KeyboardInterrupt:
            # Tell the user where to pick up again. 'entry' is unbound when
            # the interrupt arrives before the first dump entry was read.
            try:
                if not self.skipping:
                    wikipedia.output(
                        u'To resume, use "-xmlstart:%s" on the command line.'
                        % entry.title)
            except NameError:
                pass

    def isTitleExcepted(self, title):
        """Return True if one of the 'title' exceptions matches the title."""
        if 'title' not in self.exceptions:
            return False
        for pattern in self.exceptions['title']:
            if pattern.search(title):
                return True
        return False

    def isTextExcepted(self, text):
        """Return True if a 'text-contains' exception matches the text."""
        if 'text-contains' not in self.exceptions:
            return False
        for pattern in self.exceptions['text-contains']:
            if pattern.search(text):
                return True
        return False


class ReplaceRobot:
    """
    A bot that tags talk pages with the project template.

    For every page delivered by the generator the matching talk page is
    looked up. A missing talk page is created with g_prj_tmpl as its only
    content; an existing one gets g_prj_tmpl prepended unless g_prj_regex
    already matches its text. Talk pages in namespace 11 are skipped, or,
    when g_removal_tmpl is set, get the configured replacements applied
    instead.
    """
    def __init__(self, generator, replacements, exceptions=None,
                 acceptall=False, allowoverlap=False, recursive=False,
                 addedCat=None, sleep=None):
        """
        Arguments:
            * generator    - A generator that yields Page objects.
            * replacements - A list of 2-tuples of original text (as a
                             compiled regular expression) and replacement
                             text (as a string).
            * exceptions   - A dictionary which defines when not to change an
                             occurrence. See below. Defaults to no
                             exceptions.
            * acceptall    - If True, the user won't be prompted before changes
                             are made.
            * allowoverlap - If True, when matches overlap, all of them are
                             replaced.
            * recursive    - Stored on the instance; not used by this class.
            * addedCat     - If set to a value, a Page for this category is
                             stored in self.addedCat; otherwise
                             self.addedCat is None.
            * sleep        - If not None, the number of seconds to sleep
                             before each replacement in doReplacements().

        Structure of the exceptions dictionary:
        This dictionary can have these keys:

            title
                A list of regular expressions. All pages with titles that
                are matched by one of these regular expressions are skipped.
            text-contains
                A list of regular expressions. All pages with text that
                contains a part which is matched by one of these regular
                expressions are skipped.
            inside
                A list of regular expressions. All occurrences are skipped
                which lie within a text region which is matched by one of
                these regular expressions.
            inside-tags
                A list of strings. These strings must be keys from the
                exceptionRegexes dictionary in wikipedia.replaceExcept().

        """
        self.generator = generator
        self.replacements = replacements
        # Create the default per instance; a '{}' default in the signature
        # would be one dictionary shared by every bot.
        if exceptions is None:
            exceptions = {}
        self.exceptions = exceptions
        self.acceptall = acceptall
        self.allowoverlap = allowoverlap
        self.recursive = recursive
        # Always define the attribute so that reading it never raises
        # AttributeError when no category was requested.
        self.addedCat = None
        if addedCat:
            site = wikipedia.getSite()
            cat_ns = site.category_namespaces()[0]
            self.addedCat = wikipedia.Page(site,
                                           cat_ns + ':' + addedCat)
        self.sleep = sleep

    def isTitleExcepted(self, title):
        """
        Iff one of the exceptions applies for the given title, returns True.
        """
        if 'title' in self.exceptions:
            for exc in self.exceptions['title']:
                if exc.search(title):
                    return True
        return False

    def isTextExcepted(self, original_text):
        """
        Iff one of the exceptions applies for the given page contents,
        returns True.
        """
        if 'text-contains' in self.exceptions:
            for exc in self.exceptions['text-contains']:
                if exc.search(original_text):
                    return True
        return False

    def doReplacements(self, original_text):
        """
        Returns the text which is generated by applying all replacements to
        the given text.
        """
        new_text = original_text
        exceptions = []
        if 'inside-tags' in self.exceptions:
            exceptions += self.exceptions['inside-tags']
        if 'inside' in self.exceptions:
            exceptions += self.exceptions['inside']
        for old, new in self.replacements:
            if self.sleep is not None:
                time.sleep(self.sleep)
            new_text = wikipedia.replaceExcept(new_text, old, new, exceptions,
                                               allowoverlap=self.allowoverlap)
        return new_text

    def _save(self, page, text, **kwargs):
        """
        Hands text to page.put_async() and reports the known save errors
        instead of letting them abort the run. Extra keyword arguments are
        passed on to put_async().
        """
        try:
            page.put_async(text, **kwargs)
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict'
                             % (page.title(),))
        except wikipedia.SpamfilterError as e:
            wikipedia.output(
                u'Cannot change %s because of blacklist entry %s'
                % (page.title(), e.url))
        except wikipedia.PageNotSaved as error:
            wikipedia.output(u'Error putting page: %s'
                             % (error.args,))
        except wikipedia.LockedPage:
            wikipedia.output(u'Skipping %s (locked page)'
                             % (page.title(),))

    def run(self):
        """
        Starts the robot.
        """
        # Run the generator which will yield Pages which might need to be
        # changed.
        new_text = u''
        try:
            for page in self.generator:
                if self.isTitleExcepted(page.title()):
                    wikipedia.output(
                        u'Skipping %s because the title is on the exceptions list.'
                        % page.aslink())
                    continue
                try:
                    # The generator is expected to deliver subject pages;
                    # talk pages themselves are not processed.
                    if page.isTalkPage():
                        wikipedia.output(u'isTalkPage')
                        continue
                    # Switch to the talk page: same title, next namespace.
                    page = wikipedia.Page(page.site(),
                        page.site().namespace(page.namespace() + 1) + ':'
                          + page.titleWithoutNamespace())
                    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                                     % page.title())
                    if not page.isTalkPage():
                        wikipedia.output(u'not page.isTalkPage')
                        continue

                    # Decide the mode for THIS page only. Keeping the flag
                    # from an earlier page would make the treatment of a
                    # page depend on what the generator yielded before it.
                    removal_tmpl_mode = False
                    if page.namespace() == 11:
                        if not g_removal_tmpl:
                            continue
                        removal_tmpl_mode = True

                    if not page.exists() and not removal_tmpl_mode:
                        # No talk page yet: create one that holds only the
                        # project template.
                        wikipedia.output(u'The talkpage is not exist... create a new one %s' % page.title())
                        new_text = g_prj_tmpl
                        self._save(page, new_text, minorEdit=True)
                        continue

                    # Load the page's text from the wiki
                    wikipedia.output(u'The talkpage seems to be exist... edit %s' % page.title())
                    original_text = page.get(get_redirect=True)
                    if removal_tmpl_mode:
                        wikipedia.output(u'remove %s in ..... ' % g_prj_regex)
                        new_text = self.doReplacements(original_text)
                    else:
                        # Nothing to do when the template is already there.
                        if re.search(g_prj_regex, original_text) is not None:
                            wikipedia.output(u'The template is found... ')
                            continue
                        if not page.canBeEdited():
                            wikipedia.output(u"You can't edit page %s"
                                             % page.aslink())
                            continue
                        new_text = g_prj_tmpl + original_text

                except wikipedia.NoPage:
                    wikipedia.output(u'Page %s not found' % page.aslink())
                    continue
                # Show the title of the page we're working on.
                # Highlight the title in purple.
                wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                                 % page.title())
                wikipedia.showDiff(original_text, new_text)
                if not self.acceptall:
                    choice = wikipedia.inputChoice(
                                u'Do you want to accept these changes?',
                                ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                    if choice in ['a', 'A']:
                        self.acceptall = True
                    elif choice not in ['y', 'Y']:
                        # Change rejected; go on with the next page.
                        continue
                # Reached for 'yes', 'all' and when acceptall was already
                # set; every save goes through the same error handling.
                self._save(page, new_text)
        finally:
            wikipedia.output(u'... finally:!')

def prepareRegexForMySQL(pattern):
    """
    Converts a Python regular expression into a form that can be put
    between single quotes in a MySQL RLIKE clause.

    The shorthand classes \\s, \\d and \\w become the POSIX classes
    [:space:], [:digit:] and [:alnum:]. MySQL only understands these names
    inside a bracket expression, so outside of brackets the class is
    wrapped in its own pair of brackets ('\\s+' -> '[[:space:]]+'), while
    inside an existing bracket expression only the name is inserted
    ('[\\s\\d]' -> '[[:space:][:digit:]]'). Unescaped single quotes are
    escaped with a backslash; every other escape sequence is copied as is.

    Arguments:
        * pattern - The regular expression source as a string.

    Returns the converted pattern as a string.
    """
    # NOTE: [:alnum:] is only an approximation of \w, it does not contain
    # the underscore.
    # NOTE(review): backslashes are not doubled for the SQL string literal
    # (the original had that step commented out) -- confirm how the query
    # is quoted before relying on escapes such as '\.'.
    posix_names = {'s': 'space', 'd': 'digit', 'w': 'alnum'}
    pieces = []
    bracket_start = -1          # index of the open '[', -1 when outside
    i = 0
    length = len(pattern)
    while i < length:
        char = pattern[i]
        if char == '\\' and i + 1 < length:
            # Consume the escape as a unit so that the second character of
            # '\\\\' is never mistaken for the start of a new escape.
            escaped = pattern[i + 1]
            if escaped in posix_names:
                name = '[:%s:]' % posix_names[escaped]
                if bracket_start < 0:
                    name = '[' + name + ']'
                pieces.append(name)
            else:
                pieces.append(char + escaped)
            i += 2
            continue
        if char == '[' and bracket_start < 0:
            bracket_start = i
        elif char == ']' and bracket_start >= 0:
            # A ']' directly after '[' or '[^' is a literal member of the
            # set and does not close it.
            literal = (i == bracket_start + 1
                       or (i == bracket_start + 2
                           and pattern[bracket_start + 1] == '^'))
            if not literal:
                bracket_start = -1
        elif char == "'":
            char = "\\'"
        pieces.append(char)
        i += 1
    return ''.join(pieces)


def main():
    """
    Parses the command line, builds the page generator and runs the bot.

    Replacement pairs come from the command line, from a predefined fix
    (-fix:) or, when neither is given, from interactive prompts. The pages
    to work on come from an XML dump (-xml), an SQL query (-sql), explicit
    titles (-page) or one of the generators of
    pagegenerators.GeneratorFactory. When no generator results, the help
    text is shown and the script exits.

    Raises wikipedia.Error when an odd number of replacement arguments is
    given or when -fix is combined with command line replacements.
    """
    add_cat = None
    gen = None
    # summary message
    summary_commandline = None
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = {
        'title':         [],
        'text-contains': [],
        'inside':        [],
        'inside-tags':   [],
    }
    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used
    # if -xml flag is present
    xmlFilename = None
    # title of the dump entry to start with (-xmlstart); None means the
    # whole dump is read
    xmlStart = None
    useSql = False
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the
    # -always flag.
    acceptall = False
    # Will become True if the user inputs the commandline parameter -nocase
    caseInsensitive = False
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # Do all hits when they overlap
    allowoverlap = False
    # Do not recurse replacement
    recursive = False
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))
    # Between a regex and another (using -fix) sleep some time (not to waste
    # too much CPU
    sleep = None

    # Read commandline parameters.
    for arg in wikipedia.handleArgs():
        if arg == '-regex':
            regex = True
        # -xmlstart has to be tested before -xml, which is a prefix of it.
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = wikipedia.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg == '-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(
                                    u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-excepttitle:'):
            exceptions['title'].append(arg[13:])
        elif arg.startswith('-excepttext:'):
            exceptions['text-contains'].append(arg[12:])
        elif arg.startswith('-exceptinside:'):
            exceptions['inside'].append(arg[14:])
        elif arg.startswith('-exceptinsidetag:'):
            exceptions['inside-tags'].append(arg[17:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg.startswith('-sleep:'):
            sleep = float(arg[7:])
        elif arg == '-always':
            acceptall = True
        elif arg == '-recursive':
            recursive = True
        elif arg == '-nocase':
            caseInsensitive = True
        elif arg.startswith('-addcat:'):
            add_cat = arg[8:]
        elif arg.startswith('-namespace:'):
            try:
                namespaces.append(int(arg[11:]))
            except ValueError:
                namespaces.append(arg[11:])
        elif arg.startswith('-summary:'):
            wikipedia.setAction(arg[9:])
            summary_commandline = True
        elif arg.startswith('-allowoverlap'):
            allowoverlap = True
        else:
            # Either a page generator option or a piece of replacement text.
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator
            else:
                commandline_replacements.append(arg)

    if len(commandline_replacements) % 2:
        raise wikipedia.Error('require even number of replacements.')
    elif len(commandline_replacements) == 2 and fix is None:
        replacements.append((commandline_replacements[0],
                             commandline_replacements[1]))
        if summary_commandline is None:
            wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg )
                                % (' (-' + commandline_replacements[0] + ' +'
                                   + commandline_replacements[1] + ')'))
    elif len(commandline_replacements) > 1:
        if fix is None:
            pairs = [( commandline_replacements[i],
                       commandline_replacements[i + 1] )
                     for i in range(0, len(commandline_replacements), 2)]
            replacements.extend(pairs)
            if summary_commandline is None:
                replacementsDescription = '(%s)' % ', '.join(
                    [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                wikipedia.setAction(
                    wikipedia.translate(wikipedia.getSite(), msg )
                    % replacementsDescription)
        else:
            raise wikipedia.Error(
                'Specifying -fix with replacements is undefined')
    elif fix is None:
        # Nothing on the command line: ask for the replacement pairs.
        old = wikipedia.input(u'Please enter the text that should be replaced:')
        new = wikipedia.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = wikipedia.input(
u'Please enter another text that should be replaced, or press Enter to start:')
            if old == '':
                change += ')'
                break
            new = wikipedia.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        if not summary_commandline:
            default_summary_message =  wikipedia.translate(wikipedia.getSite(), msg) % change
            wikipedia.output(u'The summary message will default to: %s'
                             % default_summary_message)
            summary_message = wikipedia.input(
u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:')
            if summary_message == '':
                summary_message = default_summary_message
            wikipedia.setAction(summary_message)

    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes.fixes[fix]
        except KeyError:
            wikipedia.output(u'Available predefined fixes are: %s'
                             % fixes.fixes.keys())
            wikipedia.stopme()
            sys.exit()
        if 'regex' in fix:
            regex = fix['regex']
        if 'msg' in fix:
            wikipedia.setAction(
                wikipedia.translate(wikipedia.getSite(), fix['msg']))
        if 'exceptions' in fix:
            exceptions = fix['exceptions']
        replacements = fix['replacements']

    # already compile all regular expressions here to save time later
    flags = re.UNICODE
    if caseInsensitive:
        flags = flags | re.IGNORECASE
    for i in range(len(replacements)):
        old, new = replacements[i]
        if not regex:
            old = re.escape(old)
        replacements[i] = re.compile(old, flags), new

    # 'inside-tags' is left out: it holds key names, not expressions.
    for exceptionCategory in ['title', 'text-contains', 'inside']:
        if exceptionCategory in exceptions:
            patterns = exceptions[exceptionCategory]
            if not regex:
                patterns = [re.escape(pattern) for pattern in patterns]
            exceptions[exceptionCategory] = [re.compile(pattern, flags)
                                             for pattern in patterns]

    if xmlFilename:
        gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
                                          replacements, exceptions)
    elif useSql:
        whereClause = 'WHERE (%s)' % ' OR '.join(
            ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
             for (old, new) in replacements])
        # Only the 'text-contains' exceptions describe page text that can
        # be excluded in the query. Iterating the exceptions dictionary
        # itself would yield its keys (plain strings without a .pattern
        # attribute) and always fail.
        textExceptions = exceptions.get('text-contains', [])
        if textExceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join(
                ["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
                 for exc in textExceptions])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)

    elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle)
                 for PageTitle in PageTitles]
        gen = iter(pages)

    if not gen:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('replace')
        wikipedia.stopme()
        sys.exit()
    if namespaces != []:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    if xmlFilename:
        # XML parsing can be quite slow, so use smaller batches and
        # longer lookahead.
        preloadingGen = pagegenerators.PreloadingGenerator(gen,
                                            pageNumber=20, lookahead=100)
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep)
    bot.run()

# Script entry point. The finally clause makes sure wikipedia.stopme() is
# called however main() ends: normally, via sys.exit() or with an exception.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()