# Keith Briggs 2017-07-23 html output; nodate
# coding=utf8
# Keith Briggs 2017-07-17 en dash
"""Parse compact "document date descriptor" strings.

A descriptor such as ``c.1250``, ``e13C``, ``1230s``, ``1327/8`` or
``1200-1250`` is parsed by the ``DDD`` class into:

* a float sort key,
* a normalized ("clean") form,
* a wordy English form, and
* a LaTeX form.

Regnal-year specifications are delegated to the ``regnal_year_03`` module.
"""
from sys import stderr, path, exit
import re
from regnal_year_03 import parse_regnal_year_spec, get_regnalyear, int_to_roman

# Maps a compact marker (as it appears in the input) to its wordy expansion.
wordy_expander = {
    '': '',
    '<': 'before ',
    '>': 'after ',
    'circa': 'circa ',
    'c': 'circa ',
    'C': 'circa ',
    'nd': 'no date, ',
    'n.d.': 'no date, ',
    'E': 'early ',
    'M': 'middle of the ',
    'L': 'late ',
    'EM': 'early to middle ',
    'ML': 'middle to late ',
    '1H': 'first half of the ',
    '2H': 'second half of the ',
    '1T': 'first third of the ',
    '2T': 'middle third of the ',
    '3T': 'last third of the ',
    '1Q': 'first quarter of the ',
    '2Q': 'second quarter of the ',
    '3Q': 'third quarter of the ',
    '4Q': 'fourth quarter of the ',
}

# Offset added to the sort key for each period prefix.  For a century the
# value is added as-is to the century's first year; for a decade it is
# divided by 10 first, but only for the e/m/l/em/ml prefixes (see DDD).
eml_shift = {
    'E': 33.0,
    'M': 66.0,
    'L': 100.0,
    'EM': 75.0,
    'ML': 100.0,
    '1H': 50.0,
    '2H': 100.0,
    '1T': 33.0,
    '2T': 66.0,
    '3T': 100.0,
    '1Q': 25.0,
    '2Q': 50.0,
    '3Q': 75.0,
    '4Q': 100.0,
}

# --- regular-expression fragments -------------------------------------------
# The group names below are exactly the keys that DDD.__init__ looks up in
# m.groupdict().  Templates containing %d are instantiated twice: with 0 for
# the first date and with 1 for the (optional) second date of a range.
nodate = r'(nd)|(n\.d\.)|(no date)'
prenote = r'(?P<prenote>(%s)|(\[(.*?)\]))?' % nodate
postnote = r'(\[(?P<postnote>.*?)\])?'
circa = r'(?P<circa%d>(c\.?)|(circa))'
uncertain = r'(?P<uncertain%d>\?)'
ba = r'(?P<ba%d>[<>])'  # before or after
half = r'[12]h'
third = r'[123]t'
quarter = r'[1234]q'
eml = r'(em)|(ml)|[eml]'
# The first %s is filled with a literal '%d' so that the result is itself a
# template taking the 0/1 suffix.
prefix = r'(?P<prefix%s>(%s)|(%s)|(%s)|(%s))' % ('%d', half, third, quarter, eml,)
simplerange = r'(?P<simplerange>(1\d\d\d[-]\d)|(1\d\d\d[-]\d\d)$)'  # post-1000 only
century = r'(?P<century%d>\d\d?)[Cc]'
decade = r'(?P<decade%d>\d{2,3}0)s'
year = r'(?P<year%d>\d{1,4})'
oldstyle = r'(?P<oldstyle%d>(\d{3}[012345678]/\d)|(\d{3}9/\d{2}))'

first_dict = {
    'uncertain': uncertain % 0,
    'ba': ba % 0,
    'circa': circa % 0,
    'prefix': prefix % 0,
    'year': year % 0,
    'simplerange': simplerange,
    'century': century % 0,
    'decade': decade % 0,
    'oldstyle': oldstyle % 0,
}
second_dict = {
    'uncertain': uncertain % 1,
    'ba': ba % 1,
    'circa': circa % 1,
    'prefix': prefix % 1,
    'year': year % 1,
    'century': century % 1,
    'decade': decade % 1,
    'oldstyle': oldstyle % 1,
}
first = '{uncertain}?{ba}?{circa}?{prefix}?({simplerange}|{oldstyle}|{decade}|{year}|{century})'.format(**first_dict)
second = '{uncertain}?{ba}?{circa}?{prefix}?({oldstyle}|{decade}|{year}|{century})'.format(**second_dict)
second = '((?P<rangesep>([x-])|([-–]{2}))' + second + ')?'  # note en dash here
dd = prenote + first + second + postnote + '$'
print(dd)  # kept from the original: the assembled pattern is printed at import time
re_dd = re.compile(dd, flags=re.IGNORECASE)
re_spaces = re.compile(r'(\s{2,})')
re_circa = re.compile(r'c.(.*?)(?=$|-|x)')  # lookahead assertion


def simplerange_to_float(x):
    """Return the end year of a simple range such as '1234-5' or '1234-56'.

    The part after the hyphen replaces the last one or two digits of the
    four-digit start year: '1234-5' -> 1235.0, '1234-56' -> 1256.0.
    """
    a, b = x.split('-')
    if len(b) == 1:
        return float(a[:3] + b)
    return float(a[:2] + b)


class DDD:
    """Document date descriptor.

    After construction the instance carries:
      ok       -- True if the input was parsed successfully
      sortkey  -- float sort key (-1.0 if undefined)
      clean    -- normalized compact form
      wordy    -- English expansion
      latex    -- LaTeX form ('' if the input could not be parsed)
      dd       -- the exact input
    """

    def __init__(s, dd, verbose=False):
        """Parse the descriptor string dd; verbose=True prints debug output."""
        s.verbose = verbose
        s.dd = dd  # save exact input (set first so __repr__/__str__ always work)
        s.clean = ''
        s.wordy = ''
        s.latex = ''  # defined up front so get_latex() is safe on failed parses
        s.sortkey = -1.0  # indicates not yet defined
        if not dd:  # empty input
            s.ok = False
            return
        s.regnalyear = None, None, None
        dds = dd.strip()
        # special case: regnal year (these cannot be uncertain)...
        ry = parse_regnal_year_spec(dds)
        if ry:
            if ry == 'error':
                s.ok = False
                s.sortkey = -1.0
                s.wordy = s.clean = 'DDDFIXME(%s)' % dd
                return
            if s.verbose:
                print('ry="%s"' % (ry,))
            s.ok = True
            s.regnalyear = ry
            year, monarch = ry[0], ry[1] + ' ' + str(ry[2])
            actual_year = get_regnalyear(year, monarch)
            if s.verbose:
                print('year="%s", monarch="%s", actual_year="%s"' % (year, monarch, actual_year,))
            if s.verbose:
                print('int_to_roman(ry[2])="%s"' % (int_to_roman(ry[2])))
            monarch_name = (monarch.split())[0]
            s.clean = str(year) + ' ' + monarch_name + ' ' + int_to_roman(ry[2])
            if s.verbose:
                print('s.clean="%s"' % (s.clean,))
            s.wordy = s.clean + ' (%s)' % actual_year
            s.latex = s.clean + ' (%s)' % actual_year
            s.latex = s.latex.replace('<', '$<$').replace('>', '$>$')
            if '/' in actual_year:
                # old-style split year such as 1327/8: sort on the later year
                s.oldstyle = actual_year
                x, _ = actual_year.split('/')
                s.sortkey = float(x) + 1.0
            else:
                s.sortkey = float(actual_year)
            return
        # general case...
        sortkey_offset0 = sortkey_offset1 = 0.0
        m = re_dd.match(dds)
        if not m:
            s.ok = False
            return
        s.ok = True
        # Plain names hold the wordy pieces, c_* names the compact pieces;
        # suffix 0 is the first date, suffix 1 the second date of a range.
        nodate = prenote = postnote = c_rangesep = rangesep = ''
        ba0 = circa0 = prefix0 = year0 = decade0 = century0 = oldstyle0 = uncertain0 = ''
        ba1 = circa1 = prefix1 = year1 = decade1 = century1 = oldstyle1 = uncertain1 = ''
        c_ba0 = c_circa0 = c_prefix0 = c_year0 = c_decade0 = c_century0 = c_oldstyle0 = c_uncertain0 = ''
        c_ba1 = c_circa1 = c_prefix1 = c_year1 = c_decade1 = c_century1 = c_oldstyle1 = c_uncertain1 = ''
        eml0 = eml1 = False
        # The loop relies on groupdict() yielding the groups in pattern order:
        # a prefix must be seen before the decade it modifies.
        for key, value in m.groupdict().items():
            if value is None:
                continue
            if s.verbose:
                print('key="%s" value="%s"' % (key, value,))
            if key == 'simplerange':
                c_year0 = year0 = value
                s.sortkey = simplerange_to_float(value)
                break  # as in the original: nothing after a simple range is processed
            elif key == 'prenote':
                prenote = value.strip('[]')
                if prenote == 'nd' or prenote == 'n.d.' or prenote == 'no date':
                    nodate = 'n.d.'
                    prenote = ''
            elif key == 'rangesep':
                c_rangesep = value
                rangesep = ' to '
            elif key == 'postnote':
                postnote = value.strip('[]')
            elif key == 'ba0':
                c_ba0 = value
                ba0 = wordy_expander[value]
            elif key == 'ba1':
                c_ba1 = value
                ba1 = wordy_expander[value]
            elif key == 'circa0':
                c_circa0 = 'c.'
                circa0 = 'circa '
            elif key == 'circa1':
                c_circa1 = 'c.'
                circa1 = 'circa '
            elif key == 'uncertain0':
                c_uncertain0 = '?'
                uncertain0 = 'perhaps '
            elif key == 'uncertain1':
                c_uncertain1 = '?'
                uncertain1 = 'perhaps '
            elif key == 'prefix0':
                c_prefix0 = value.lower()
                prefix0 = wordy_expander[value.upper()]
                sortkey_offset0 = eml_shift[value.upper()]
                if c_prefix0 in ('e', 'm', 'l', 'em', 'ml'):
                    eml0 = True
            elif key == 'prefix1':
                c_prefix1 = value.lower()
                prefix1 = ' the ' + wordy_expander[value.upper()]
                sortkey_offset1 = eml_shift[value.upper()]
                if c_prefix1 in ('e', 'm', 'l', 'em', 'ml'):
                    eml1 = True
            elif key == 'decade0':
                c_decade0 = decade0 = value + 's'
                s.sortkey = float(value) + 10.0
                # NOTE(review): the original indentation was lost; the -10.0 is
                # taken to be unconditional so that a bare decade sorts at its
                # first year, like a bare century -- confirm.
                if eml0:
                    sortkey_offset0 /= 10.0
                sortkey_offset0 -= 10.0
            elif key == 'decade1':
                s.sortkey = float(value) + 10.0
                c_decade1 = decade1 = value + 's'
                if eml1:
                    sortkey_offset1 /= 10.0
                sortkey_offset1 -= 10.0
            elif key == 'century0':
                s.sortkey = 100.0 * float(value) - 100.0
                c_century0 = '%sC' % value
                century0 = ' %sth century' % value
            elif key == 'century1':
                s.sortkey = 100.0 * float(value) - 100.0
                c_century1 = '%sC' % value
                century1 = ' %sth century' % value
            elif key == 'year0':
                c_year0 = year0 = value
                s.sortkey = float(year0)
            elif key == 'year1':
                c_year1 = year1 = value
                s.sortkey = float(year1)
            elif key == 'oldstyle0':
                c_oldstyle0 = oldstyle0 = value
                i = value.index('/')
                s.sortkey = float(value[:i]) + 1.0
            elif key == 'oldstyle1':
                c_oldstyle1 = oldstyle1 = value
                i = value.index('/')
                s.sortkey = float(value[:i]) + 1.0
        # A non-zero offset on the second date takes precedence.
        if sortkey_offset1:
            s.sortkey += sortkey_offset1
        else:
            s.sortkey += sortkey_offset0
        clean = ' '.join((nodate, c_uncertain0, c_ba0, c_circa0, c_prefix0, c_year0, c_century0, c_decade0, c_oldstyle0, c_rangesep, c_uncertain1, c_ba1, c_circa1, c_prefix1, c_year1, c_century1, c_decade1, c_oldstyle1,))
        wordy = ' '.join((wordy_expander[nodate], prenote, uncertain0, ba0, circa0, prefix0, year0, century0, decade0, oldstyle0, rangesep, uncertain1, ba1, circa1, prefix1, year1, century1, decade1, oldstyle1, postnote,))
        clean = clean.replace('-', u'–')  # en dash
        wordy = re_spaces.sub(' ', wordy)
        s.wordy = wordy.replace(' ,', ',').strip()
        if prenote:
            prenote = '[' + prenote + ']'
        if postnote:
            postnote = '[' + postnote + ']'
        s.clean = (prenote + clean.replace(' ', '') + postnote).strip().replace(' ,', ',')
        s.latex = s.clean
        if 'c.' in s.latex:  # \circa
            s.latex = re_circa.sub(r'\circa{\1}', s.latex, count=2)
        if '--' not in s.latex:
            s.latex = s.latex.replace('-', '--')
        s.latex = s.latex.replace(u'–', '--')
        s.latex = s.latex.replace('<', '$<$').replace('>', '$>$').replace('x', r'$\times$')

    def get_ok(s):
        """Return True if the input was parsed successfully."""
        return s.ok

    def get_sortkey(s):
        """Return the float sort key (-1.0 if undefined)."""
        return s.sortkey

    def get_clean(s):
        """Return the normalized compact form."""
        return s.clean

    def get_wordy(s):
        """Return the English expansion."""
        return s.wordy

    def get_latex(s):
        """Return the LaTeX form."""
        return s.latex

    def __repr__(s):
        return "DDD('%s')" % (s.dd,)

    def __str__(s):
        # The 'DD(' spelling (vs 'DDD(' in __repr__) is kept from the original.
        return "DD('%s')" % (s.dd,)

    def __lt__(s, t):
        """Order descriptors by sort key."""
        # NOTE(review): the text following 's.sortkey<' was lost from the
        # source; comparing against t.sortkey is the reconstruction.  Any
        # further methods that sat between here and test_01 are also lost.
        return s.sortkey < t.sortkey


def test_01(fn='DDD_test_data.txt', html_fn='DDD_test_output.html'):
    """Parse every line of fn, print a report and write an HTML copy.

    Lines starting with '#' are copied through as comments.

    NOTE(review): the original signature, the default file names and the
    HTML tags were lost from the source; the defaults and the <pre> wrapper
    below are placeholders -- confirm against the original.
    """
    with open(html_fn, 'w') as html, open(fn, 'r') as f:
        html.write('<pre>\n%-26s\t%4s\t%-22s\t%s\n' % ('input', 'sort', 'normalized output', 'verbose output',))
        print('-' * 120)
        for line in f:
            text = line.rstrip('\n')  # safe even if the last line has no newline
            if '#' == line[0]:
                html.write('\n%s\n' % text)
                continue
            dd = DDD(text)
            if not dd.get_ok():
                print('"%s" failed' % dd)
                html.write('"%s" failed\n' % dd)
            else:
                output_line = '%-20s\t%4.0f\t%18s\t%-22s\t%s' % (text, dd.get_sortkey(), dd.get_clean(), dd.get_latex(), dd.get_wordy(),)
                print(output_line)
                html_line = '%-26s\t%4.0f\t%-22s\t%s' % (text, dd.get_sortkey(), dd.get_clean(), dd.get_wordy(),)
                # Escape for HTML (the entity names were decoded in the source
                # and are reconstructed here).
                html_line = html_line.replace('<', '&lt;').replace('>', '&gt;').replace(u'–', '&ndash;')
                html.write(html_line + '\n')
                if dd.get_sortkey() < 500:
                    exit()  # kept from the original: stop at the first sort key below 500
        html.write('</pre>')


if __name__ == '__main__':
    test_01()