#!/usr/bin/env python

# Preprocessor for serto font for the use with LaTeX.
# Copyright 2001-2003 by Johannes Heinecke
# you can use this and change it as you wish, under the condition
# that the original copyright line is not deleted

FONTFILE="syriac.font" # specify absolute path

# TODO: block mode with transcription (a "sertotrans" mode was sketched in
#       the main loop but never enabled)
# sErtO: capital vowels are below the line

import getopt
import re
import string
import sys

# Longest transcription key (in characters) tried when tokenizing.
_MAX_KEY_LEN = 5
# A vowel at the start of a word gets an extra "'" carrier token.
_VOWELS = "aeiou"
_DIGITS = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
# Tokens that are never turned into "Q" (shadda) when repeated.
_NO_SHADDA = ("~",) + _DIGITS
# Tokens after which a letter is not considered to have a successor.
_WORD_END = ("~", "!", ",", ".", ";", "?")


class Serto:
    """Convert transcribed Syriac into serto font codes and LaTeX markup.

    The conversion tables are read from a font description file:

    * lines shorter than two characters are ignored;
    * lines starting with "#" are comments, except that "#FONT: name"
      sets ``fontname`` and "#TRANS:" switches to the transliteration
      section;
    * before "#TRANS:", each line needs at least seven whitespace
      separated fields: the key, one field that is not used here, and
      five integers (isolated, initial, medial, final, link);
    * after "#TRANS:", each line is "key [replacement]"; without a
      replacement the key maps to itself.
    """

    def __init__(self, elatex=0, fontfile=None):
        """Load the conversion tables.

        elatex   -- true when the output is meant for eLaTeX; ``texify``
                    then keeps the output in logical order instead of
                    reversing it.
        fontfile -- path of the font description; defaults to the module
                    level ``FONTFILE``.
        """
        self.elatex = elatex  # eLaTeX needs \TeXXeTstate=1
        if fontfile is None:
            fontfile = FONTFILE
        with open(fontfile, "r") as fp:
            lines = fp.readlines()

        # NOTE(review): all three patterns have empty delimiter groups and
        # therefore match the empty string at every position.  The opening
        # and closing markup tags appear to have been lost from this copy of
        # the file -- restore them from the original distribution.
        self.inlineS = re.compile("()(.*?)()")
        self.inlineT = re.compile("()(.*?)()")
        self.inlineST = re.compile("()(.*?)()")

        self.tabelle = {}       # "_d": (isol, init, med, fin, link)
        self.transtabelle = {}  # "_d": \d{d}
        self.fontname = ""
        status = "serto"
        for z in lines:
            if len(z) < 2:
                continue
            if z[0] == "#":
                if z[:6] == "#FONT:":
                    self.fontname = z.split()[1].strip()
                elif z[:7] == "#TRANS:":
                    status = "transliterate"
                continue
            felder = z.split()
            if status == "serto":
                if len(felder) < 7:
                    # Diagnostics go to stderr; stdout carries the LaTeX
                    # output when this module is run as a script.
                    sys.stderr.write("ERROR: %s\n" % z)
                else:
                    self.tabelle[felder[0]] = (int(felder[2]),
                                               int(felder[3]),
                                               int(felder[4]),
                                               int(felder[5]),
                                               int(felder[6]))
            elif len(felder) < 2:
                self.transtabelle[felder[0]] = felder[0]
            else:
                self.transtabelle[felder[0]] = felder[1]

    @staticmethod
    def _match_length(table, text, start, end):
        """Return the length of the longest key of *table* found at
        text[start:], looking no further than *end*; 0 if none matches.

        The candidate length is capped at the remaining input so that the
        returned length always equals the length of the matched slice.
        """
        for ll in range(min(_MAX_KEY_LEN, end - start), 0, -1):
            if text[start:start + ll] in table:
                return ll
        return 0

    def tokenize(self, str, xlen):
        """Split the transcription *str* (of length *xlen*) into
        ``self.tokens``.

        A backslash followed by ASCII letters becomes one command token,
        "{" and "}" become tokens of their own, everything else is matched
        greedily against the keys of ``self.tabelle``; characters that
        match no key are dropped.  A single vowel at the start of the
        input or after "~" is preceded by a "'" token, and a key that
        repeats the previous token is replaced by "Q" (shadda) unless its
        link value is 3 or it is "~" or a digit.
        """
        ix = 0
        self.tokens = []
        self.digits = []
        while ix < xlen:
            ch = str[ix]
            if ch == "\\":
                command = "\\"
                ix = ix + 1
                while ix < xlen and str[ix] in string.ascii_letters:
                    command = command + str[ix]
                    ix = ix + 1
                self.tokens.append(command)
            elif ch in "{}":
                self.tokens.append(ch)
                ix = ix + 1
            else:
                ll = self._match_length(self.tabelle, str, ix, xlen)
                if not ll:
                    ix = ix + 1
                    continue
                piece = str[ix:ix + ll]
                at_word_start = not self.tokens or self.tokens[-1] == "~"
                if ll == 1 and piece in _VOWELS and at_word_start:
                    self.tokens.extend(["'", piece])
                elif (self.tokens
                      and piece == self.tokens[-1]
                      and self.tabelle[piece][4] != 3
                      and piece not in _NO_SHADDA):
                    # insert shadda
                    self.tokens.append("Q")
                else:
                    self.tokens.append(piece)
                ix = ix + ll

    def transtokenize(self, str, xlen):
        """Split *str* (of length *xlen*) into ``self.tokens`` using the
        keys of ``self.transtabelle``.

        Unlike ``tokenize`` there is no command/brace handling and no
        shadda; a single vowel at the start of the input or after "~" is
        stored as one token with a leading "'".  Characters that match no
        key are dropped.
        """
        ix = 0
        self.tokens = []
        self.digits = []
        while ix < xlen:
            ll = self._match_length(self.transtabelle, str, ix, xlen)
            if not ll:
                ix = ix + 1
                continue
            piece = str[ix:ix + ll]
            at_word_start = not self.tokens or self.tokens[-1] == "~"
            if ll == 1 and piece in _VOWELS and at_word_start:
                self.tokens.append("'" + piece)
            else:
                self.tokens.append(piece)
            ix = ix + ll

    def transliterate(self, syrisch):
        """Return the transliteration of the transcription *syrisch*.

        Blanks are tokenized as "~" and come out as blanks again; every
        other token is replaced through ``self.transtabelle`` (tokens
        without an entry are kept unchanged).
        """
        syrisch = syrisch.replace(" ", "~")
        self.transtokenize(syrisch, len(syrisch))
        ret = []
        for tok in self.tokens:
            if tok == "~":  # blank
                ret.append(" ")
            else:
                ret.append(self.transtabelle.get(tok, tok))
        return "".join(ret)

    def syriacise(self):
        """Replace ``self.tokens`` by font characters and return the
        resulting string.

        Command and brace tokens are copied unchanged.  Tokens whose
        medial code is -1 are skipped.  For every other token the code of
        its contextual form (see ``context``) is emitted.  Runs of digits
        are collected and emitted in reversed order once the next
        non-digit letter, or the end of the input, is reached.
        """
        out = []
        digits = []
        self.maxlen = len(self.tokens)
        number = 0
        for i in range(self.maxlen):
            tok = self.tokens[i]
            if tok[0] in "\\{}":
                out.append(tok)
            elif self.tabelle[tok][2] == -1:
                # skipping letter
                continue
            else:
                form = self.context(i)
                if tok in _DIGITS:
                    number = 1
                    digits.append(chr(self.tabelle[tok][form]))
                else:
                    if number == 1:
                        number = 0
                        digits.reverse()
                        out.extend(digits)
                        digits = []
                    out.append("%c" % (self.tabelle[tok][form]))
        if number:
            digits.reverse()
            out.extend(digits)
        return "".join(out)

    def context(self, ix):
        """returns 0 if letter is isolated
        1 if letter is initial
        2 if letter is medial
        3 if letter is final"""
        has_prev = self.before(ix)
        has_next = self.next(ix)
        if has_prev and has_next:
            return 2
        if has_prev:
            return 3
        if has_next:
            return 1
        return 0

    def next(self, ix):
        """returns 1 if next token is a letter

        Tokens with link value 2 or 3 are looked through; a command or
        brace token, "~" or a punctuation mark ends the search with 0.
        Relies on ``self.maxlen`` as set by ``syriacise``.
        """
        for i in range(ix + 1, self.maxlen):
            tok = self.tokens[i]
            if tok[0] in "\\{}":
                return 0
            if self.tabelle[tok][4] in [2, 3]:
                continue
            if tok not in _WORD_END:
                return 1
            return 0
        return 0

    def before(self, ix):
        """returns 1 if preceding token is a letter

        Tokens with link value 2 are looked through; a command or brace
        token, "~", or a token with link value 0 ends the search with 0.
        """
        for i in range(ix - 1, -1, -1):
            tok = self.tokens[i]
            if tok[0] in "\\{}":
                return 0
            link = self.tabelle[tok][4]
            if link == 2:
                continue
            if tok == "~" or link == 0:
                return 0
            return 1
        return 0

    def convert(self, transcript):
        """Interface function: return the font string for *transcript*.

        Blanks are handled as the dummy token "~".
        """
        transcript = transcript.replace(" ", "~")
        self.tokenize(transcript, len(transcript))
        return self.syriacise()

    def texify(self, word):
        """Return LaTeX source for the transcription *word*.

        Codes below 16 wrap the preceding item in an upperserto macro and
        codes from 16 to 31 in a lowerserto macro (with "A" as carrier
        when nothing precedes).  Codes from 127 upwards and the codes 34,
        35, 36, 37, 38 and 95 are written as char commands; all other
        characters are copied.  Unless ``self.elatex`` is set the item
        order is reversed.
        """
        res = []
        # Use this instance, not a module level global: the method must
        # also work when the class is imported rather than run as a script.
        for ll in self.convert(word):
            code = ord(ll)
            if code < 16:
                if len(res):
                    res[-1] = "\\upperserto{%d}{%s}" % (code, res[-1])
                else:
                    res.append("\\upperserto{%d}{A}" % code)  # A: Olaf
            elif code < 32:
                if len(res):
                    res[-1] = "\\lowerserto{%d}{%s}" % (code, res[-1])
                else:
                    res.append("\\lowerserto{%d}{A}" % code)
            elif code < 127 and code not in [34, 35, 36, 37, 38, 95]:
                # 95: underscore
                res.append(ll)
            else:
                res.append("\\char%d{}" % code)
        if not self.elatex:
            res.reverse()
        return "".join(res)

    def inlineserto(self, matchobject):
        """re.sub callback: typeset group 2 of the match in serto."""
        return "{\\serto\\beginR %s\\endR}" % self.texify(matchobject.group(2))

    def inlinetrans(self, matchobject):
        """re.sub callback: emphasised transliteration of group 2."""
        return "\\emph{%s}" % self.transliterate(matchobject.group(2))

    def inlinesertotrans(self, matchobject):
        """re.sub callback: serto followed by the transliteration."""
        return "{\\serto\\beginR %s\\endR} \\emph{%s}" \
               % (self.texify(matchobject.group(2)),
                  self.transliterate(matchobject.group(2)))


#-------------------------------------------------------
def _usage():
    """Write the command line help to stderr."""
    sys.stderr.write("usage:\n serto.py [-e] inputfile\n")
    sys.stderr.write(" -e: for usage with elatex\n\n")


def main(argv=None):
    """Command line entry point: preprocess one input file to stdout.

    argv -- argument list without the program name; defaults to
            ``sys.argv[1:]``.  The only option is "-e" (eLaTeX mode).
    """
    if argv is None:
        argv = sys.argv[1:]
    sys.stderr.write("serto - TeX - preprocessor\n(c) Johannes Heinecke\n")
    if not argv:
        _usage()
        return
    sys.stderr.write("\n")
    elatex = 0
    optlist, comargs = getopt.getopt(argv, "e")
    for (o, a) in optlist:
        if o == "-e":
            elatex = 1
    if not comargs:
        # options but no input file
        _usage()
        return

    serto = Serto(elatex=elatex)
    mode = "latin"
    write = sys.stdout.write
    with open(comargs[0]) as fp:
        for z in fp:
            content = z[:-1]
            # NOTE(review): the four marker tests below compare against the
            # empty string, which makes the third and fourth unreachable.
            # The block-marker tags appear to have been lost from this copy
            # of the file -- restore them from the original distribution.
            # TODO: a further "sertotrans" block mode was sketched here but
            # never enabled.
            if content == "":  # must be on a single line (will be deleted)
                if not elatex:
                    sys.stderr.write("using without the -e option "
                                     "(and elatex) may not work!\n")
                mode = "serto"
                write('{\\serto\\beginR %\n')
            elif content.strip() == "":
                mode = "latin"
                # an explicit \endR here causes problems in the last line
                write('}%\n')
            elif content == "":
                mode = "trans"
                write('{\\it %\n')
            elif content.strip() == "":
                mode = "latin"
                write('}%\n')
            elif mode == "latin":
                a = serto.inlineS.sub(serto.inlineserto, z)
                b = serto.inlineT.sub(serto.inlinetrans, a)
                c = serto.inlineST.sub(serto.inlinesertotrans, b)
                write(c)
            elif mode == "trans":
                write(serto.transliterate(z) + "\n")
            elif content == "":
                # Was a Python 2 `print ...,`; the separating blank is
                # written right away, as print(..., end=" ") would do.
                write("\n\\beginR ")
            else:
                write(serto.texify(z) + "\n")


if __name__ == "__main__":
    main()