Logo Search packages:      
Sourcecode: uicilibris version File versions  Download package

wikiParser.py

#!/usr/bin/env python
#     $Id: wikiParser.py 45 2011-08-14 17:22:26Z georgesk $ 
#
# wikiParser.py is part of the package uicilibris
#
# uicilibris is based on wiki2beamer's code, which was authored by
# Michael Rentzsch and Kai Dietrich
#
# (c) 2007-2008 Michael Rentzsch (http://www.repc.de)
# (c) 2009-2010 Michael Rentzsch (http://www.repc.de)
#               Kai Dietrich (mail@cleeus.de)
# (c) 2011      Georges Khaznadar (georgesk@ofset.org)
#
# Create high-level parseable code from a wiki-like code, like LaTeX
#
#
#     This file is part of uicilibris.
# uicilibris is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# uicilibris is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with uicilibris.  If not, see <http://www.gnu.org/licenses/>.


import sys, url, StringIO, re
from regularExpressions import *
from BeautifulSoup import BeautifulSoup
from plugin.plugin import plugins
from templateParser import imageParser

# Module-wide handle on the last (and only) wikiParser object;
# this global is used by some plugins.
wParser = None

def getWikiContents(completeUrl):
    """
    Fetches the wiki code of a page through its edit form.

    The address is rewritten from the "index.php/" form to the
    "index.php?title=" form, "&action=edit" is appended, and the first
    content of the textarea whose id is "wpTextbox1" is returned.
    @param completeUrl the url of a wiki page, encoded in utf-8
    @return the wiki code for the page, else a void string
    """
    editUrl = completeUrl.replace("index.php/", "index.php?title=") + "&action=edit"
    response = url.urlopen(editUrl)
    if not response:
        return ""
    textarea = BeautifulSoup(response.read()).find('textarea', id="wpTextbox1")
    # no edit box, or an empty one: nothing to return
    if not textarea or len(textarea.contents) == 0:
        return ""
    return textarea.contents[0]

def findDivById(n,_id):
    """
    Looks below a node for a 'div' element carrying a given id;
    the search itself is delegated to findTagById.
    @param n a node
    @param _id attribute to search
    @return a node with the right 'div' else None
    """
    wantedTag = 'div'
    return findTagById(n, wantedTag, _id)

def findTagById(n,tag,_id):
    """
    Walks the descendants of a node, depth first and in document order,
    and stops at the first element with the requested tag and id.
    The starting node itself is not examined.
    @param n a node to begin with
    @param tag the tag to search
    @param _id attribute to search
    @return a node with the right tag and id, else None
    """
    # explicit stack; children are pushed reversed so that the leftmost
    # child is popped first
    pending = list(n.childNodes)
    pending.reverse()
    while pending:
        candidate = pending.pop()
        if candidate.nodeType != candidate.ELEMENT_NODE:
            # only elements are examined or descended into
            continue
        if candidate.nodeName == tag and candidate.getAttribute('id') == _id:
            return candidate
        offspring = list(candidate.childNodes)
        offspring.reverse()
        pending.extend(offspring)
    return None

############# a callback function for the image parser
def imgFun(d):
    """
    The standard custom parser for a dictionary coming from an image,
    which adds the feature of registering images for further processing.

    @param d a dictionary describing an image: "imgFile" is mandatory,
    "width" and "height" are optional, and integer keys carry the other
    values: "thumb" asks for a figure, and the first other value (taken
    in ascending key order) becomes the caption.
    @return the LaTeX code which includes the image
    """
    options = ["%s=%s" % (key, d[key]) for key in ("width", "height") if key in d]
    code = "\\includegraphics"
    if options:
        code += "[%s]" % ", ".join(options)
    code += "{%s}" % d["imgFile"]
    figure = False
    caption = ""
    # Numeric keys are visited in ascending order so that "the first
    # unidentified value" does not depend on the dictionary's own ordering.
    numericKeys = sorted(
        k for k in d if isinstance(k, int) and not isinstance(k, bool)
    )
    for k in numericKeys:
        if d[k] == "thumb":
            figure = True
        elif caption == "":  # the first unidentified value becomes a caption
            figure = True
            caption = d[k]
    if figure:
        code = "\\begin{figure}[h!]\n\\begin{center}\n\\caption{%s}\\vspace{0.5em}\n%s\n\\end{center}\n\\end{figure}\n" % (caption, code)
    # wParser stays None until a wikiParser has been constructed; in that
    # case there is nowhere to register the image, but the code is still valid
    if wParser is not None:
        wParser.registerImage(d["imgFile"])
    return code

class lineJoiner:
    """
    a class to join lines taking in account some wiki syntax
    """

    def __init__(self, f, encoding=None):
        """
        the constructor; all the lines of f are read and joined at once
        @param f a file-like readable object
        @param encoding triggers some specific encoding if different from None
        """
        self.encoding = encoding
        self.lines = self.joinLines(f.readlines())

    def getLines(self):
        """
        @return the lines, as joined by the constructor
        """
        return self.lines

    def joinLines(self, lines):
        """
        join lines ending with unescaped percent signs,
        unless inside codemode or nowiki mode
        @param lines a list of text lines
        @return the lines properly joined, each one ending with a newline
        """
        nowikimode = False
        codemode = False
        joined = []   # result array
        pending = ''  # text accumulated for the line being built
        for raw in lines:
            if self.encoding:
                raw = raw.encode(self.encoding)
            (_, nowikimode) = get_nowikimode(raw, nowikimode)
            if not nowikimode:
                (_, codemode) = get_codemode(raw, codemode)

            # return chars are kept only for <code>
            if codemode:
                current = raw
            else:
                current = raw.rstrip()

            # NOTE(review): every stored entry gets one more "\n" appended
            # below, so this comparison with "\n\n" looks like it can only
            # fail after a bare newline kept in code mode -- confirm whether
            # consecutive blank lines were meant to be collapsed.
            if current == "" and joined and joined[-1] != "\n\n":
                current = "\n\n"  # simplify and keep the return codes for LaTeX

            verbatim = nowikimode or codemode
            continued = (not verbatim
                         and current.endswith("%")
                         and not current.endswith("\\%"))
            if continued:
                # drop the percent sign and wait for the next line
                pending += current[:-1]
            else:
                # output a line
                pending += current
                if len(pending) > 0:
                    joined.append(pending + "\n")
                    pending = ''

        # a continuation left open by the very last line must not be lost
        if pending:
            joined.append(pending + "\n")
        return joined
    
class txtFileCacher:
    """
    a cache for text files: it maps file names to lists of lines
    """

    def __init__(self):
        """
        the constructor
        """
        self.reset()

    def __str__(self):
        return "%s" % self._cache

    def reset(self):
        """
        makes a brand new cache
        """
        self._cache = dict()

    def addLines(self, filename, lines):
        """
        caches some lines; lines already cached under the same name are kept
        @param filename the name of the file which contained the lines
        @param lines the contents
        """
        if filename not in self._cache:
            self._cache[filename] = lines

    def getLines(self, filename):
        """
        retrieval of data from the cache. If filename refers to a file
        whose lines are not in the cache, the lines are cached on the fly.
        @param filename the name of the file
        @return the lines cached under this name
        """
        if filename not in self._cache:
            self._cache[filename] = self.read_file_to_lines(filename)
        return self._cache[filename]

    def read_file_to_lines(self, filename, fileObj=None, encoding=None):
        """
        read file; the lines are joined by a lineJoiner. Lines coming from
        fileObj are cached under filename, lines read from the disk are not
        cached here (getLines does it).
        @param filename the name of the file to read
        @param fileObj an already open file object if it is given
        @param encoding triggers some specific encoding if different from None
        @return the lines read from this file
        """
        if not fileObj:
            # the with statement closes the file even if the joiner raises
            with open(filename, "r") as f:
                lines = lineJoiner(f, encoding).getLines()
        else:
            lines = lineJoiner(fileObj, encoding).getLines()
            self.addLines(filename, lines)

        return lines

    def clear(self):
        """
        clears the cache
        """
        self.reset()

class wikiParser:
    """
    A converter from wiki-style layout to many high-level syntaxes
    like LaTeX/Beamer
    """

    # address of a wiki page: group 1 is the base address of the wiki,
    # group 2 is the title of the page
    _pageAddressRe = re.compile("http://(.+)/index.php/(.+)")
    # the leading part of a base address, up to the first slash
    _hostRe = re.compile("([^/]+).*")
    # a wiki link like [[Some page]]
    _wikiLinkRe = re.compile(r"\[\[([^\]]+)\]\]")
    # an inclusion instruction like >>>filename<<<
    _includeRe = re.compile(">>>(.*)<<<", re.VERBOSE)
    # title of the special page which expands templates, as used in the
    # address; this is the url-quoted French title. Override it in an
    # instance or a subclass for a wiki which names the page otherwise.
    expandTemplatesPage = "Sp%C3%A9cial:ExpandTemplates"

    def __init__(self, args, isatty=True, isUrl=False, report=False):
        """
        The constructor
        @param args a list of filenames, or a single URL.
        When a single URL, it is meant to be the address of a wikimedia
        page which contains a series of other addresses of the same
        mediawiki in mediawiki syntax.
        @param isatty is True when data do not come from the standard input
        @param isUrl is True to force the initialization, by considering arg as a single url which is supposed to be a normal wiki page
        @param report if True, messages are emitted to sys.stderr;
        if it is callable, it is invoked with the same messages
        """
        global wParser
        wParser = self
        self.lines = []
        self.cache = txtFileCacher()
        self.imageSet = set()
        self.report = report
        input_files = []
        if not isatty:
            input_files.append('stdin')
            self.cache.read_file_to_lines(filename='stdin', fileObj=sys.stdin)
        if isUrl:  # forced url system
            self.include_one_address(args[0])
            fromWiki = True
        else:
            # as a side effect, isUrl downloads the pages when it succeeds
            fromWiki = self.isUrl(args)
        if fromWiki:
            input_files.append('url')
            self.cache.read_file_to_lines(filename='url', fileObj=self.urlLines)
        else:
            input_files += args
        for file_ in input_files:
            self.lines += self.include_file_recursive(file_)

    def reloadCache(self, text, info=""):
        """
        Reloads the cache from a given text, after running template processors.
        @param text a unicode string with wiki code.
        @param info some informative message
        """
        self.toStdErr(info)
        text = self.applyPLugins(text)
        text = self.wikiTemplates(text)
        self.loadUrlLines(text + "\n")
        self.cache.reset()
        self.lines = self.cache.read_file_to_lines(filename='url', fileObj=self.urlLines, encoding="utf-8")

    def reloadCacheIndirect(self, text, cbInfo=None):
        """
        Reloads the cache from the current mediawiki. The given text must
        provide a series of wiki addresses.
        @param text an utf-8 string with wiki code.
        @param cbInfo a callback function to display progress messages. It should accept one string as an input.
        """
        wikiAddresses = self._wikiLinkRe.findall(text)
        self.cache.reset()
        if cbInfo is None:
            cbInfo = self.toStdErr  # default callback
        self.include_addresses(wikiAddresses, cbInfo=cbInfo)
        self.url2lines()

    def url2lines(self):
        """
        loads self.lines from the file-like object self.urlLines
        """
        self.lines = self.cache.read_file_to_lines(filename='url', fileObj=self.urlLines)

    def registerImage(self, img):
        """
        registers an image filename to retrieve it later
        @param img the filename of the image
        """
        self.imageSet.add(img)

    def imageCount(self):
        """
        @return the count of embedded images
        """
        return len(self.imageSet)

    def _splitAddress(self, completeUrl):
        """
        recognizes the address of a wiki page; on success self.baseAddress
        and self.host are set
        @param completeUrl the address to analyze
        @return the title of the page, else None if the address
        does not have the expected shape
        """
        m = self._pageAddressRe.match(completeUrl)
        if not m:
            return None
        self.baseAddress = m.group(1).decode("utf-8")
        self.host = self._hostRe.findall(self.baseAddress)[0]
        return m.group(2).decode("utf-8")

    def _sendReport(self, message):
        """
        emits a message according to self.report: a callable is invoked
        with the message, any other true value sends it to sys.stderr
        @param message the text to emit
        """
        # callable must be tested first: a callable is also a true value
        if callable(self.report):
            self.report(message)
        elif self.report:
            sys.stderr.write(message + "\n")

    def isUrl(self, arg):
        """
        @param arg a list; only its first item is examined
        @return True if arg is a valid Url to a mediawiki.
        As a side-effect, self.urlLines will be loaded with the
        downloaded contents
        """
        if len(arg) < 1:
            return False
        basePage = self._splitAddress(arg[0])
        if basePage is None:
            return False
        text = self.getWikiContents(basePage)
        if not text:
            return False
        wikiAddresses = self._wikiLinkRe.findall(text)
        self.include_addresses(wikiAddresses, cbInfo=self.toStdErr)
        return True

    def getWikiContents(self, title):
        """
        @param title the title of a wiki page
        @return the wiki code for the page, else a void string
        """
        completeUrl = "http://%s/index.php?title=%s" % (self.baseAddress, url.quote_plus(title.encode("utf-8")))
        # this is the module-level function, not the method
        return getWikiContents(completeUrl)

    def applyPLugins(self, s):
        """
        pre-processes a few simple templates which have a precise definition for
        LaTeX, then processes the images.
        @param s the string to be processed
        @return the processed string
        """
        #== the parsers of every plugin are applied, then the image parser
        parsers = [c() for c in plugins] + [imageParser(imgFun)]
        for parser in parsers:
            s = parser.sub(s)
        return s

    def wikiTemplates(self, contents):
        """
        calls the special page ExpandTemplates in the wiki
        to apply templates which must be processed by mediawiki
        @param contents the code with templates (unicode string)
        @result the code with all templates expanded, else a void string
        """
        completeUrl = "http://%s/index.php/%s" % (self.baseAddress, self.expandTemplatesPage)
        data = url.urlencode({"contexttitle": "",
                              "input": contents.encode("utf-8"),
                              "removecomments": "1",
                              "generate_xml": "0"})
        page = url.urlopen(completeUrl, data)
        soup = BeautifulSoup(page.read())
        area = soup.find('textarea', id="output")
        # an output area without any content would make contents[0] fail
        if area and len(area.contents) > 0:
            return area.contents[0]
        return ""

    def include_one_address(self, completeUrl):
        """
        gets contents from a simple wiki page and puts them into
        self.urlLines; when the address is not recognized, self.urlLines
        is loaded with a void text
        @param completeUrl an url
        """
        text = ""
        basePage = self._splitAddress(completeUrl)
        if basePage is not None:
            text = self.getWikiContents(basePage)
            # NOTE(review): getWikiContents, as written in this file, gives
            # back "" and not None when a page cannot be read, so this
            # branch looks unreachable -- confirm the intent.
            if text is None:
                text = "Error: the page %s does not exist" % completeUrl
            self._sendReport("'%s'" % completeUrl)
            text = self.applyPLugins(text)
            text = self.wikiTemplates(text)
            text = text.encode("utf-8")
        self.loadUrlLines(text)

    def loadUrlLines(self, text):
        """
        puts a text into the file-like object self.urlLines
        @param text the input
        """
        self.urlLines = StringIO.StringIO(text)

    def toStdErr(self, info):
        """
        sends a string to sys.stderr, surrounded by single quotes;
        nothing is sent when the string is void
        @param info the information to display
        """
        if info:
            sys.stderr.write("'%s'\n" % info)

    def include_addresses(self, wikiAddresses, cbInfo=None):
        """
        populates self.urlLines with data coming from addresses
        self.urlLines will be a file-like object.
        @param wikiAddresses a list of wikiaddresses to visit.
        @param cbInfo a callback used to display an information about each address included. It should accept one string as input
        """
        pieces = []
        for a in wikiAddresses:
            if cbInfo:
                cbInfo(a)
            contents = self.applyPLugins(self.getWikiContents(a))
            processedContents = self.wikiTemplates(contents)
            pieces.append("<!-- uicilibris: begin '%s' -->\n" % a)
            pieces.append(processedContents + "\n")
            pieces.append("<!-- uicilibris: end '%s' -->\n" % a)
        # enforce the same encoding as ordinary text files
        text = "".join(pieces).encode("utf-8")
        # and make it like a text file
        self.loadUrlLines(text)

    def include_file_recursive(self, base):
        """
        makes a list of lines from a file, including recursively
        other files when necessary
        @param base the name of the file to process
        @return a list of lines
        """
        stack = []   # the files being included, outermost first
        output = []

        def recurse(file_):
            stack.append(file_)
            nowikimode = False
            codemode = False
            for line in self.cache.getLines(file_):
                if nowikimode or codemode:
                    # inside a verbatim section, only its end is searched
                    if nowikiendre.match(line):
                        nowikimode = False
                    elif codeendre.match(line):
                        codemode = False
                    output.append(line)
                elif nowikistartre.match(line):
                    output.append(line)
                    nowikimode = True
                elif codestartre.match(line):
                    output.append(line)
                    codemode = True
                else:
                    include = self.includeInstruction(line)
                    if include is None:
                        output.append(line)
                    elif include in stack:
                        raise IncludeLoopException('Loop detected while trying '
                                "to include: '%s'.\n" % include +
                                'Stack: '+ "->".join(stack))
                    else:
                        recurse(include)
            stack.pop()

        recurse(base)
        return output

    def includeInstruction(self, line):
        """ Extract filename to include.

        @param line string
            a line that might include an inclusion
        @return string or None
            if the line begins with an inclusion, return the filename,
            otherwise return None
        """
        m = self._includeRe.match(line)
        if m is None:
            return None
        # only the captured name is wanted: the text which follows "<<<",
        # at least the newline ending the line, is not part of the filename
        return m.group(1)
                
def get_frame_closing(state):
    """
    builds the code which closes a frame
    @param state an object providing an attribute frame_footer
    @return the footer surrounded by spaces, then a line \\end{frame}
    """
    closingTemplate = " %s \n\\end{frame}\n"
    footer = state.frame_footer
    return closingTemplate % footer

def syntax_error(message, code):
    """
    reports a syntax error on sys.stderr, then terminates the program
    with the exit status -3
    @param message a description of the error
    @param code the faulty code
    """
    # explicit writes behave the same way with Python 2 and Python 3,
    # which is not the case of the "print >>" statement
    sys.stderr.write('syntax error: %s\n' % message)
    sys.stderr.write('\tcode:\n%s\n' % code)
    sys.exit(-3)

def get_nowikimode(line, nowikimode):
    """
    extracts the "nowiki" feature from a line: a start mark is searched
    only outside the nowiki mode, an end mark only inside it, and the
    mark which is found is removed from the line
    @param line the line to process
    @param nowikimode the current mode regarding the "wiki" property
    @result a tuple (line, nowikimode) after processing
    """
    if nowikimode:
        if nowikiendre.match(line) is not None:
            return (nowikiendre.sub('', line), False)
    else:
        if nowikistartre.match(line) is not None:
            return (nowikistartre.sub('', line), True)
    # no mark to take in account: nothing changes
    return (line, nowikimode)

def get_codemode(line, codemode):
    """
    extracts the "code" feature from a line: a start mark is searched
    only outside the code mode, an end mark only inside it, and the
    mark which is found is removed from the line
    @param line the line to process
    @param codemode the current mode regarding the "code" property
    @result a tuple (line, codemode) after processing
    """
    if codemode:
        if codeendre.match(line) is not None:
            return (codeendre.sub('', line), False)
    else:
        if codestartre.match(line) is not None:
            return (codestartre.sub('', line), True)
    # no mark to take in account: nothing changes
    return (line, codemode)


class IncludeLoopException(Exception):
    """
    raised when a file is about to be included while it is
    already being included
    """


if __name__=="__main__":
    # a short demo: the plugins are listed, then instantiated and printed.
    # print is called with a single parenthesized argument, a form which
    # gives the same output with Python 2 and Python 3
    print("hello this a demo, here is a list of plugins")
    print(plugins)

    print("I shall create an instance of each class coming from the list of plugins")
    print("and print them")

    for p in plugins:
        print(p())
    print("")

Generated by  Doxygen 1.6.0   Back to index