#!/usr/bin/env python # -*- coding: iso-8859-15 -*- # K M Briggs 2009-02-03 10:45 import re from sys import stderr,exit,argv import cPickle from os.path import expanduser from tkinter_app_00 import App,win if win: corpus_path='./0163/' else: corpus_path='~/0163/' fixes={ '&d;': '{dh}', '&t;': '{th}', '&D;': '{DH}', '&T;': '{TH}', '&ae;': '{ae}', '&AE;': '{AE}', '&': '{&}', 'ö': '{oe}' # '\xc3', } def fix(t): for f in fixes: t=t.replace(f,fixes[f]) return t # re_tei_2=re.compile(r'T\d{5})">(?P.*?)',re.DOTALL|re.MULTILINE) # &T;onon to cuntan heale. re_s=re.compile(r'T\d{11})"\s+n="(?P\d+(\.\d+)?)">(?P.*?)',re.DOTALL|re.MULTILINE) # Bounds, Sawyer 1: Birch 1885-99, no. 3 re_sourcedesc=re.compile(r'\s*\s*(?P.*?)\s*\s*',re.DOTALL|re.MULTILINE) def get_tei2_dictionary(): try: pkl=open('text_dictionary.pkl','rb') tei2_d=cPickle.load(pkl) pkl.close() return tei2_d except: print>>stderr,'building text_dictionary.pkl...', t='' for n in range(1,78): print>>stderr,'seg%02d'%n, f=open(expanduser(corpus_path+'/oecorp/seg%02d'%n),'r') t+=f.read() f.close() print>>stderr d={} tei2_d={} tei_2s=re_tei_2.finditer(t) for tei_2 in tei_2s: # for each text t=tei_2.group('txt') ms=re_s.finditer(t) sd=re_sourcedesc.search(t).group('sd') full_text=[] for m in ms: # for each line in the text txt=m.group('txt') id=m.group('id') n=m.group('n') txt=txt.replace('','[|').replace('','|]') txt=txt.replace('','[[').replace('',']]') txt=txt.replace('\n',' ') txt=fix(txt) full_text.append(txt) for word in txt.split(): if word[0]=='[': continue word=word.rstrip(',.') d.setdefault(word,[]).append((id,n)) ft=''.join(full_text).strip() tei2_d[tei_2.group('id')]=ft pkl=open('text_dictionary.pkl','wb') cPickle.dump(tei2_d,pkl,-1) pkl.close() return tei2_d def get_desc_dictionary(): fn='desc_dictionary.pkl' try: pkl=open(fn,'rb') d=cPickle.load(pkl) pkl.close() return d except: print>>stderr,'building %s...'%fn, t='' for n in range(1,78): print>>stderr,'seg%02d'%n, f=open(expanduser(corpus_path+'/oecorp/seg%02d'%n),'r') t+=f.read() f.close() d={} tei_2s=re_tei_2.finditer(t) for tei_2 in tei_2s: # for each text sd=re_sourcedesc.search(tei_2.group(0)).group('sd') d[tei_2.group('id')]=fix(sd) pkl=open(fn,'wb') cPickle.dump(d,pkl,-1) pkl.close() return d def to_ascii(t): t=t.replace('{oe}','ö') t=t.replace('{ae}','æ') t=t.replace('{AE}','Æ') t=t.replace('{dh}','ð') t=t.replace('{DH}','Ð') t=t.replace('{th}','þ') t=t.replace('{TH}','Þ') t=t.replace('{&}','&') return t def context(txt,s,e,span=15): n=len(txt) a,b=max(0,s-span),min(e+span,n) while a>=0 and txt[a]!=' ': a-=1 while b