#!/usr/bin/python2.3
# -*- coding: UTF-8 -*-
"""
HTML to FictionBook converter.
"""
"""
Chris TODO

* output encoding, only really works if output is the same as system encoding.
  E.g. windows (command line) ascii, for most out-of-linux utf8.
  This is due to use of str types constructed on the fly in the encoding
  specified in command line and then implicit conversion from str to Unicode
  (joining str to Unicode type, needs explicit decode)
* uriparse lib for proper URL parsing
* images
    * check support for links to images (as well as alternative text display/processing)
    * handle images with url encoding, e.g. "my%20pic.jpg" versus "my pic.jpg" see url/uri above..
    * add a check "if image not (png or jpeg) convert to PNG"? Most readers seem to only support png and jpeg
* check/change encoding of source to be (7 bit) ASCII, make this the default (instead of Windows-1252)
* remove getopt, use: optparse, (my modified) wxoptparse or my document optparse
* "em dash" only handles for html name tag "&mdash;", does not handle at all &#8212;
  and unicode/xml literal &#x2014; is displayed as text of that!
  probably due to use of SGML parser which isn't that flexible
* test multi document html input
* test href properly
* href bug if --not-convert-quotes is not issued, get ">>" in href!!
* -i bug not working properly if file does not exist (uses stdin)
  - Low priority as file logic removed from class, html2fb does NOT suffer from this
* -o bug not working properly if file is in use, get no error/exception displayed and instead goes to stdout
  - Low priority as file logic removed from class, html2fb does NOT suffer from this
* TODO check python coding style/guide for open() versus file()
  -- http://mail.python.org/pipermail/python-dev/2004-July/045928.html
* pep8 where appropriate (note use 120 cols not 80)
* pychecker/pylint
* process() open() has except without type, should have type and not ignore everything
* convert_to_fb() out_file=file() also has except without type, should have type and not ignore everything
* convert2png has try/except with no restrictions, errors are lost.
* stuff before first header is book description: on/off
* remove SGMLParser? replace with Beautiful Soup http://www.crummy.com/software/BeautifulSoup/
* chardet support - http://chardet.feedparser.org/
* support of non ascii character (e.g. Unicode) like "...", mdash, etc. open/close quotes,
  check HaliReader on pocketPC (suspect missing unicode font) and import my mapping code
"""

from sgmllib import SGMLParser
import sys
from types import DictType, TupleType, ListType
import re
import tempfile
import os
import base64
import time
import urllib
import locale
import shutil
import cStringIO
import fb_utils
import time
import codecs

version='0.1.1'

# Module wide values
_SENTENCE_FIN = u'\'".!?:\xBB\u2026' # \xBB == >>, \u2026 == ...
_HEAD_CHARS = u'0123456789VXILM*@' _DIAL_START = u'-\u2013\u2014' # MINUS, N DASH, M DASH _DIAL_START2 = u')-\u2013\u2014 .;' _CH_REPL_AMP = u'\x00' _CH_REPL_LT = u'\x01' _CH_REPL_GT = u'\x02' _CH_LEFT_Q = u'\\1\xab' _CH_RIGHT_Q = u'\xbb\\1' _CH_FLOW = u' ' _CH_DOTS = u'\u2026' _CH_TIRE = u'\u2013' _RE_LQUOTES = re.compile('([ (;-])"') _RE_RQUOTES = re.compile('"([ <&.,;?!)-])') _RE_LQUOTES2 = re.compile('^((?:<[^>]*>)*)"',re.M) _RE_RQUOTES2 = re.compile('"((?:<[^>]*>)*)$',re.M) _RE_TAG = re.compile('<[^>]*>') _RE_ROMAN = re.compile('^m?m?m?(c[md]|d?c{0,3})(x[lc]|l?x{0,3})(i[xv]|v?i{0,3})$', re.I) _RE_EL = re.compile('\s*()') # Flags _TAG_SKIP = 0x0001 _TAG_STRONG = 0x0004 _TAG_EM = 0x0008 _TAG_NOTSKIP = 0x0010 _TAG_ENDP = 0x0020 _TAG_STARTP = 0x0002 _TAG_PRE = 0x0040 _TAG_HEADER = 0x0080 _TAG_INP = 0x0100 _TAG_ID = 0x0200 _TAGS={ 'a' : _TAG_INP, 'abbr' : 0, 'acronym' : 0, 'address' : 0, 'align' : 0, 'applet' : 0, 'area' : 0, 'b' : _TAG_STRONG, 'base' : 0, 'basefont' : 0, 'bdo' : 0, 'bgsound' : 0, 'big' : 0, 'blink' : 0, 'blockquote' : 0, 'body' : 0, 'br' : _TAG_STARTP|_TAG_ENDP, 'button' : 0, 'caption' : 0, 'center' : 0, 'cite' : _TAG_EM|_TAG_ID, 'code' : 0, 'col' : 0, 'colgroup' : 0, 'comment' : 0, 'dd' : 0, 'del' : 0, 'dfn' : 0, 'dir' : 0, 'div' : 0, 'dl' : 0, 'dt' : 0, 'em' : _TAG_EM, 'embed' : 0, 'fieldset' : 0, 'font' : 0, 'form' : _TAG_SKIP, 'frame' : 0, 'frameset' : 0, 'h1' : _TAG_HEADER, 'h2' : _TAG_HEADER, 'h3' : _TAG_HEADER, 'h4' : _TAG_HEADER, 'h5' : _TAG_HEADER, 'h6' : _TAG_HEADER, 'head' : _TAG_SKIP, 'hr' : _TAG_ENDP, 'html' : 0, 'i' : _TAG_EM, 'iframe' : 0, 'ilayer' : 0, 'img' : _TAG_ENDP, 'input' : 0, 'ins' : 0, 'isindex' : 0, 'kbd' : 0, 'keygen' : 0, 'label' : 0, 'layer' : 0, 'legend' : 0, 'li' : 0, 'link' : 0, 'listing' : _TAG_PRE, 'map' : 0, 'marquee' : 0, 'menu' : 0, 'meta' : 0, 'multicol' : 0, 'nextid' : 0, 'nobr' : 0, 'noembed' : _TAG_SKIP, 'noframes' : _TAG_SKIP, 'nolayer' : 0, 'nosave' : 0, 'noscript' : _TAG_SKIP, 'object' : 0, 'ol' : 0, 
'optgroup' : 0, 'option' : 0, 'p' : _TAG_STARTP|_TAG_ENDP, 'param' : 0, 'plaintext' : _TAG_PRE, 'pre' : 0, 'q' : 0, 'rb' : 0, 'rbc' : 0, 'rp' : 0, 'rt' : 0, 'rtc' : 0, 'ruby' : 0, 's' : 0, 'samp' : 0, 'script' : _TAG_SKIP, 'select' : 0, 'server' : 0, 'servlet' : 0, 'small' : 0, 'spacer' : 0, 'span' : 0, 'strike' : 0, 'strong' : _TAG_STRONG|_TAG_INP, 'style' : _TAG_SKIP, 'sub' : 0, 'sup' : 0, 'table' : 0, 'tbody' : 0, 'td' : 0, 'textarea' : 0, 'tfoot' : 0, 'th' : 0, 'thead' : 0, 'title' : _TAG_NOTSKIP, 'tr' : _TAG_STARTP, 'tt' : 0, 'u' : 0, 'ul' : 0, 'var' : _TAG_EM, 'wbr' : 0, 'xmp' : _TAG_PRE, #fb2 tags. ignored while parsing 'emphasis' : _TAG_INP, 'section' : _TAG_ID, 'poem' : _TAG_ID, 'epigraph' : _TAG_ID, } from PIL import Image class binary(object): def __init__(self): self.f = os.tmpfile() self.ids = [] def get(self): self.f.seek(0) return self.f def add(self, type, id, data): if id not in self.ids: self.f.write('' % (type, id)) self.f.write(base64.encodestring(data)) self.f.write('\n') self.ids.append(id) class MyHTMLParser(SGMLParser): """HTML parser. Originated from standard htmllib """ from htmlentitydefs import name2codepoint entitydefs={} for (name, codepoint) in name2codepoint.iteritems(): entitydefs[name] = unichr(codepoint) del name, codepoint, name2codepoint entitydefs['nbsp']=u' ' def reset(self): SGMLParser.reset(self) self.nofill = 1 # PRE active or not self.oldnofill = 0 # for saving nofill flag (for section title, for example) self.out = [] # Result self.data = '' # Currently parsed text data self.skip = '' # Skip all all between tags. End tag here self.nstack = [[],[]] # Stack for nesting tags control. first el. is tags stack, second - correspond. 
attrs self.save = '' # Storage for data between tags pair self.saving = False # Saving in progress flag self.ishtml = False # data type self.asline = (0,0,0) # [counted lines, > 80, < 80] self.ids = {} # links ids self.nextid=1 # next note id self.notes=[] # notes self.descr={} # description self.informer=None # informer (for out messages) def handle_charref(self, name): """Handle decimal escaped character reference, does not handle hex. E.g. “Quoted.” Fred’s car.""" # Modified version of Python 2.3 SGMLParser class # to fix  , etc. (named escape) ascii decode error # as well as “, etc. (decimal escape) ascii decode error try: n = int(name) except ValueError: self.unknown_charref(name) return self.handle_data(unichr(n)) def process(self, params): '''Main processing method. Process all data ''' self.params=params self._TAGS = _TAGS self.rez_descr = fb_utils.description() self.binary = binary() if 'informer' in params: self.informer=params['informer'] if self.params['convert-span-to'] == 'emphasis' or self.params['convert-span-to'] == 'em': self.params['convert-span-to'] == 'em' elif self.params['convert-span-to'] != 'strong': self.params['convert-span-to'] = None if self.params['convert-span-to'] is not None: self._TAGS['span'] = self._TAGS[self.params['convert-span-to']] secs = time.time() self.msg('HTML to FictionBook converter, ver. %s\n' % version) self.msg("Reading data...\n") ## ## use basename for href finding, could change regex instead? 
#self.msg('process:'+unicode(params['file-name'], params['sys-encoding'])) ##self.href_re = re.compile(".*?%s#(.*)" % unicode(params['file-name'], params['sys-encoding'])) self.href_re = re.compile(".*?%s#(.*)" % unicode(os.path.basename(params['file-name']), params['sys-encoding'])) self.source_directoryname = unicode(os.path.dirname(params['file-name']), params['sys-encoding']) ## try: self.header_re = params['header-re'].strip() and re.compile(params['header-re']) except: self.header_re = None outs = [] descrs = [] for source_file_path in params['source_file_path']: self.reset() self.source_file_path = source_file_path data = codecs.open(source_file_path, 'r', 'utf-8').read() if not data: return '' self.msg('Preprocessing...\n') data = self.pre_process(data) self.msg('Parsing...\n') self.feed(data+'

') self.close() self.msg('Formatting...\n') self.detect_epigraphs() self.detect_verses() self.detect_paragraphs() self.msg('Postprocessing...\n') self.post_process() self.msg('Building result document...\n') outs.append(self.out) descrs.append(self.descr) out = ('\n' \ '\n' % \ self.params['encoding-to'] + \ self.make_description(descrs) + \ '\n%s\n\n' % ''.join(outs) + \ self.make_notes()) out = re.sub(r"(?sm)(\s*\s*)", r"", out) out = out.encode(self.params['encoding-to'],'xmlcharrefreplace') params['file_out'].write(out) shutil.copyfileobj(self.binary.get(), params['file_out']) #params['file_out'].write(self.binary.get()) params['file_out'].write('') self.msg("Total process time is %.2f secs\n" % (time.time() - secs)) return True # --- Tag handling, need for parsing def unknown_starttag(self, tag, attrs): ''' Handle unknown start ttag ''' if tag in self._TAGS or self.skip: self.handle_starttag(tag, None, attrs) else: self.handle_data(self.tag_repr(tag, attrs)) def unknown_endtag(self, tag): ''' Handle unknown end ttag ''' if tag in self._TAGS or self.skip: self.handle_endtag(tag, None) else: self.handle_data("" % tag) def handle_data(self, data): ''' Handle data stream ''' data = data.replace('&',_CH_REPL_AMP).replace('<',_CH_REPL_LT).replace('>',_CH_REPL_GT) if self.saving: self.save += data else: self.data += data def handle_starttag(self, tag, method, attrs): ''' Handle all start tags ''' try: flag = self._TAGS[tag] except: flag = 0 if self.skip and not flag & _TAG_NOTSKIP: return if flag & _TAG_SKIP: self.skip = tag if not method: if flag & _TAG_EM: method = self.start_em if flag & _TAG_STRONG: method = self.start_strong if flag & _TAG_PRE: method = self.start_pre if flag & _TAG_STARTP: method = self.do_p if flag & _TAG_HEADER: method = self.start_h1 if method: method(attrs) # if detected tag, but text still non-html - set text as html if not self.ishtml and \ not flag & (_TAG_EM|_TAG_STRONG|_TAG_INP) and \ tag != 'h6': self.end_paragraph() self.ishtml = 
True self.nofill = 0 def handle_endtag(self, tag, method): ''' Handle all end tags ''' try: flag = self._TAGS[tag] except: flag = 0 if self.skip and self.skip == tag: self.skip='' self.data='' return if not method: if flag & _TAG_EM: method=self.end_em if flag & _TAG_STRONG: method=self.end_strong if flag & _TAG_PRE: method=self.end_pre if flag & _TAG_ENDP: method = self.end_paragraph if flag & _TAG_HEADER: method = self.end_h1 if method: method() def start_title(self, attrs): ''' Save document title - start''' self.start_saving() def end_title(self): ''' End saving document title ''' self.descr['title'] = ' '.join(self.stop_saving().split()).strip() def do_meta(self, attrs): ''' Handle meta tags - try get document author ''' name='' content='' for opt, val in attrs: if opt=='name': name=val elif opt=='content': content=val.strip() if name=='author' and content: self.descr['author']=content def do_p(self, attrs): '''Handle tag P''' self.end_paragraph() self.mark_start_tag('p') def start_pre(self, attrs): ''' Handle tag PRE ''' self.nofill = self.nofill + 1 self.do_p(None) def end_pre(self): ''' Handle tag /PRE ''' self.end_paragraph() self.nofill = max(0, self.nofill - 1) def start_em(self, attrs): ''' Handle tag EM ''' self.mark_start_tag('emphasis') def end_em(self): ''' Handle tag /EM ''' self.mark_end_tag('emphasis') def start_strong(self, attrs): ''' Handle tag STRONG ''' self.mark_start_tag('strong') def end_strong(self): ''' Handle tag /STRONG ''' self.mark_end_tag('strong') def start_a(self, attrs): ''' Handle tag A ''' for attrname, value in attrs: value = value.strip() if attrname == 'href': res = self.href_re.match(value) if res: value=self.make_id(res.group(1)) try: self.ids[value][1]+=1 except: self.ids[value]=[0,1] value="#"+value if self.params['skip-ext-links'] and not res: return self.mark_start_tag('a', [('xlink:href',value)]) if attrname == 'name': value = self.make_id(value) self.data+="" % value try: self.ids[value][0]+=1 except: 
self.ids[value]=[1,0] def end_a(self): ''' Handle tag /A ''' self.mark_end_tag('a') def start_h1(self, attrs): ''' Handle tag H1-H6 ''' self.end_paragraph() self.out.extend(['','
','']) self.mark_start_tag('p') self.oldnofill, self.nofill = self.nofill, 0 def end_h1(self): ''' Handle tag /H1-/H6 ''' self.end_paragraph() self.out.append('') self.nofill = self.oldnofill self.mark_start_tag('p') def do_img(self, attrs): ''' Handle images ''' if self.params['skip-images']: return src = None for attrname, value in attrs: if attrname == 'src': src = value if src: temp_image_filename= os.path.join(os.path.split(self.source_file_path)[0], urllib.unquote(src)) img = self.convert_image(temp_image_filename)#src.encode(self.params['sys-encoding'])) if img: self.end_paragraph() src=os.path.basename(src) self.out.append(self.tag_repr('image', [('xlink:href','#'+src)], True)) self.binary.add('image/%s' % img['type'], src, img['data']) def report_unbalanced(self, tag): ''' Handle unbalansed close tags''' self.handle_data('\n' % tag) def unknown_charref(self, ref): ''' Handle unknown char refs ''' # FIX: Don't know, how to handle it self.msg('Unknown/invalid char ref %s is being ignored\n' % ref) raise(Warning, 'Unknown char ref %s is being ignored\n' % ref) def unknown_entityref(self, ref): ''' Handle unknown entity refs ''' # FIX: Don't know, how to handle it self.msg('Unknown entity ref %s\n' % ref, 1) # --- Methods for support parsing def start_saving(self): ''' Not out data to out but save it ''' self.saving = True self.save = '' def stop_saving(self): ''' Stop data saving ''' self.saving = False return self.save def end_paragraph(self): ''' Finalise paragraph ''' if not self.data.strip(): try: p = self.nstack[0].index('p') if self.out[-1] == '

' or not self.out[-1]: if self.params['skip-empty-lines']: self.out.pop() else: self.out[-1] = "" self.nstack[0]=self.nstack[0][:p] self.nstack[1]=self.nstack[1][:p] else: self.mark_end_tag('p') except ValueError: pass else: if 'p' not in self.nstack[0]: self.nstack[0][0:0]='p' self.nstack[1][0:0]=[None] self.out.append('

') self.mark_end_tag('p') def mark_start_tag(self, tag, attrs=None): ''' Remember open tag and put it to output ''' try: flag = self._TAGS[tag] except: flag = 0 if tag in self.nstack[0]: self.mark_end_tag(tag) self.nstack[0].append(tag) self.nstack[1].append(attrs) if flag & _TAG_INP: self.data += self.tag_repr(tag, attrs) else: self.out.append(self.tag_repr(tag, attrs)) def mark_end_tag(self, tag): ''' Close corresponding tags. If tag is not last tag was outed, close all previously opened tags. I.e. text -> text ''' if tag not in self.nstack[0]: return while self.nstack[0]: v = self.nstack[0].pop() a = self.nstack[1].pop() try: flag = self._TAGS[v] except: flag = 0 if flag & _TAG_INP: et=self.tag_repr(v,a) if self.data.rstrip().endswith(et): self.data=self.data.rstrip()[:-len(et)] if v=='a': try: self.ids[a[0][1]][1]-=1 except: pass else: self.data += "" % v else: self.process_data() if self.out[-1]=="<%s>" % v: ## duplicate tag detected, often from replacing embedded

inside ... ## FIXME not really sure if this is 100% appropriate, is this ONLY for missing target links? self.msg("!!!!\n") self.msg('DEBUG** ' + repr(v) + ' --- ' + repr(self.out[-10:]) + ' ' + repr(v), 1) self.out.pop() else: self.out.append("" % v) if tag == v: break def process_data(self): ''' Handle accomulated data when close paragraph. ''' if not self.data.strip(): return if not self.nofill: self.data=_CH_FLOW+self.data.strip() self.out.append(self.data) else: self.data = self.process_pre(self.data) self.data = self.detect_headers(self.data) try: if self.data[0]=='

' and self.out[-1]=='

': self.out.pop() self.data = self.data[1:] msg("WoW! I must be impossible!!!") except IndexError: pass self.out.extend(self.data) self.data = '' # --- Parsed data processing methods def pre_process(self, data): data=data.replace(u'\x0e',u'

').replace(u'\x0f',u'
') for i in u'\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x10\x11'\ '\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f': data = data.replace(i,u' ') return data def post_process(self): ''' Last processing method ''' id = '' for i in range(len(self.out)): if not self.out[i]: pass # skip empty lines (can apper below, where ...out[i+1]='') elif self.out[i][0] != '<': id, p = self.process_paragraph(self.out[i], id) if p: if id: self.out[i-1] = '<%s id="%s">' % (self.out[i-1][1:-1],id) id = '' self.out[i] = p else: self.out[i-1]=self.out[i]=self.out[i+1]='' # remove empty paragraph elif id: try: if self._TAGS[self.out[i][1:-1]] & _TAG_ID: self.out[i] = '<%s id="%s">' % (self.out[i][1:-1],id) id = '' except KeyError: pass #join with dont use newline in

and tags out_tmp = '' for i in xrange(len(self.out)): is_new_line = True if i >= (len(self.out) -1 ): is_new_line = False elif self.out[i].startswith('=0]: self.out=re.sub("%s(.*?)%s" % (self.tag_repr('a',[('xlink:href','#'+i)]),'') ,r'\1',self.out) sect=self.out.find('').split()) self.out=self.out[sect:] else: if self.out.startswith('

'): self.out=self.out[len(''):] else: self.out='
\n'+self.out self.out+='\n
' self.out=_RE_EL.sub(r'\1',self.out) if self.out.startswith('
'): title = 'title' in self.descr and self.descr['title'] or '' if title: self.out = '
' + '' + title + '' + self.out[len('
'):] def detect_headers(self, data): ''' Find headers in plain text. ''' if not self.params['detect-headers']: return [data] res = [] pstart = i = 0 header = ['

', '
', '
', '', '<p>', '', # place for title (5) '</p>', '', '

'] while i < len(data)-1: empty0 = not data[i] try: empty1 = not data[i+1] empty2 = not data[i+2] empty3 = not data[i+3] except IndexError: empty1 = empty2 = empty3 = False if empty0 and empty1 and not empty2 and empty3: res.append(data[pstart:i]) header[5]=_CH_FLOW + data[i+1].strip() res.extend(header) i+=2 pstart = i+2 else: istitle = ( empty0 and not empty1 and empty2 and ( empty3 or data[i+1].startswith(' '*8) or data[i+1].isupper() or ( data[i+1].lstrip()[0] not in _DIAL_START and data[i+1][-1] not in _SENTENCE_FIN ) or data[i+1].lstrip()[0] in _HEAD_CHARS or self.is_roman_number(data[i+1]) ) ) istitle = istitle or \ not empty1 and \ self.header_re and \ self.header_re.match(data[i+1]) if istitle: res.append(data[pstart:i]) header[5]=_CH_FLOW + data[i+1].strip() res.extend(header) i+=1 while i < len(data)-1 and not data[i+1]: i+=1 pstart = i+1 i+=1 if pstart < len(data): res.append(data[pstart:]) return res def detect_epigraphs(self): ''' Detect epigraphs (in plain text) ''' if not self.params['detect-epigraphs']: return sect_found = 0 i = 0 while i < len(self.out): if type(self.out[i]) != ListType: if self.out[i] == '

': sect_found = 1 elif self.out[i] == '': sect_found = sect_found and 2 or 0 elif self.out[i] == '': sect_found = sect_found and 1 or 0 elif self.out[i][0] != '<': sect_found = sect_found!=1 and 2 or 0 else: if sect_found == 1: res = [] raw = self.out[i] lraw = len(raw) j=0 eplines = 0 epfound = 0 while j < len(raw): while j < lraw and not raw[j]: j+=1 # skip empty lines eep = -1 # search empty line for k in range(j,j+60): if k >= lraw or not raw[k]: eep = k break if eep == j: break if eep >= 0: eplines = 0 for k in range(j,eep): rawk = raw[k].lstrip() if ' '*10 in raw[k] or len(rawk) < 60: eplines +=1 if len(rawk) > 60: eplines -= 5 if rawk and ( rawk[0] in _DIAL_START or rawk[0].isdigit() and len(rawk)>2 and rawk[1] in _DIAL_START2 ): eplines -= 1 if (float(eplines)/(eep-j)>0.8): epfound += 1 author = eep-j > 1 res.extend(['','

',raw[j:eep-author]]) if author and self.clean_str(raw[eep-1]).lstrip()[0].isupper(): res.extend(['

','',_CH_FLOW + self.clean_str(raw[eep-1]).lstrip(),'']) else: if author: res[-1].append(raw[eep-1]) res.append('

') res.append('
') j=eep else: eep = -1 if eep < 0: break j+=1 if epfound: istart=i iend=i+1 if i and self.out[i-1] == '

': istart-=1 if j < len(raw)-1: res.extend(['

',raw[j:]]) elif i < len(self.out)-1 and self.out[i+1] == '

': iend+=1 self.out[istart:iend] = res i = istart + len(res)-1 sect_found = 0 i += 1 def detect_verses(self): ''' Detect verses in plain text ''' if not self.params['detect-verses']: return i=0 while i < len(self.out): if type(self.out[i]) == ListType: res=[] raw=[self.clean_str(x).rstrip() for x in self.out[i]] lraw=len(raw) pfound = jstart = j = 0 while j < lraw-3: if raw[j] and len(raw[j]) < 60 and \ raw[j+1] and len(raw[j+1]) < 80 and \ raw[j+2] and len(raw[j+2]) < 80 and \ raw[j+3] and len(raw[j+3]) < 80: fl = len(raw[j]) k = j while k < lraw: if raw[k].strip() and ( abs(len(raw[k])-fl) > 15 or \ raw[k].lstrip()[0] in _DIAL_START ): break k += 1 if k - j > 3: pfound += 1 if jstart: res.append('

') if jstart != j: res.extend([self.out[i][jstart:j],'

']) res.extend(['','']) for l in range(j,k): if raw[l]: res.extend(['',raw[l].lstrip(),'']) elif l < k-1 and res[-1] != '': res.extend(['','']) res.extend(['','']) j=k-1 jstart = k j+=1 if pfound: if jstart < lraw-1: res.extend(['

',self.out[i][jstart:]]) istart = i iend = i+1 try: if res[0] == '' and self.out[i-1] == '

': istart -= 1 except: pass try: if res[-1] == '' and self.out[i+1] == '

': iend += 1 except: pass self.out[istart:iend]=res i+=1 def detect_paragraphs(self): ''' Detect paragraphs in plain text ''' i=0 while i < len(self.out): if type(self.out[i]) == ListType: res = [] raw = self.out[i] j = 0 pfound = 0 while j < len(raw) and not raw[j]: j+=1 jstart = j while j < len(raw): if not raw[j]: try: while not raw[j]: j+=1 except IndexError: break if not self.params['skip-empty-lines']: res.append('') jstart=j continue elif self.asline or \ not self.params['detect-paragraphs'] or \ j >= len(raw)-1 or \ not raw[j+1].lstrip() or \ (raw[j+1].lstrip()[0] in _DIAL_START or \ raw[j+1].startswith(' ') ) and raw[j][-1] in _SENTENCE_FIN: pfound += 1 res.extend(['

',_CH_FLOW + '\n'.join(raw[jstart:j+1]),'

']) #res.extend(['

',_CH_FLOW + ''.join(raw[jstart:j+1]),'

']) jstart = j+1 j+=1 if pfound > 0: self.out[i:i+1]=res[1:-1] i+=len(res)-2 else: self.out[i]='\n'.join(raw).lstrip() i+=1 def detect_italic(self, text, arg): signs='_.,!?:' istart=-1 res='' while True: istart = text.find('_') if istart >= 0: iend = sys.maxint for i in signs: try: iend=min(iend, text.index(i, istart+1)) except: pass if iend == sys.maxint: iend=0 if text[istart+1:iend or None]: emp = ''+ \ text[istart+1:iend or None]+ \ '' else: emp = '' text = text[:istart]+ \ emp+ \ (iend and text[iend+(text[iend]=='_'):] or '') else: break return text def detect_notes(self, text, arg): while True: snote = text.find(arg[0]) enote = text.find(arg[1]) if snote <0 or enote <= snote: break self.notes.append((self.nextid, text[snote+1:enote])) text=(text[:snote] + '' % self.nextid + "note %s" % self.nextid+"" + text[enote+1:] ) self.nextid += 1 return text def process_pre(self, data): ''' Process preformatted data (data between
 and 
tag or plain text file) Determine text format. ''' data = [x.rstrip() for x in data.splitlines()] if type(self.asline) == TupleType: count,G80,L80 = self.asline for i in data: if len(i) > 80: G80+=1 else: L80+=1 count+=1 if count > 2000: self.asline = G80 > L80 break if type(self.asline) == TupleType: self.asline=(count, G80, L80) return data def process_paragraph(self, paragraph, id): ''' Process paragraph. Find id, normalize quotes. ''' paragraph=paragraph.strip() startp = paragraph.find('= 0: endp=paragraph.index('>',startp+4) # if '>' will not be found exception will raised, because use index found_id = paragraph[startp+4:endp] if not id: id = found_id else: self.ids[found_id][0]=0 paragraph=paragraph[:startp]+paragraph[endp+1:] startp = paragraph.find('') paragraph=paragraph[:endp+1]+paragraph[endp+1:].lstrip() if self.params['convert-quotes']: # process quotes paragraph = _RE_LQUOTES.sub(_CH_LEFT_Q, paragraph) paragraph = _RE_RQUOTES.sub(_CH_RIGHT_Q, paragraph) paragraph = _RE_LQUOTES2.sub(_CH_LEFT_Q, paragraph) paragraph = _RE_RQUOTES2.sub(_CH_RIGHT_Q, paragraph) if self.params['detect-notes']: paragraph = self.process_nontags(paragraph, self.detect_notes, "[]") paragraph = self.process_nontags(paragraph, self.detect_notes, "{}") if self.params['detect-italic']: paragraph = self.process_nontags(paragraph, self.detect_italic, None) paragraph = ' '.join(paragraph.split()) # Remove extra whitespaces return [id, paragraph] def process_nontags(self, text, func, arg): ss=0 res = '' w = '' while 0 <= ss < len(text): try: i = text.index('<',ss) w=text[ss:i] ss = i except: w=text[ss:] ss = -1 if w.strip(): # process text between tagtext if any. 
res += func(w, arg) else: res += ' ' if ss >= 0: i = text.index('>',ss) res+=text[ss:i+1] ss=i+1 return res # --- Make out document methods def make_description(self, descrs): if self.params['descr'].selfdetect: titles = [] authors = [] for descr in descrs: title = 'title' in descr and descr['title'] or '' author = 'author' in descr and descr['author'] or '' if not author and '.' in title : point = title.index('.') author = title[:point].strip() title = title[point+1:].strip() author = author.split() a ={ 'first': author and author[0] or '', 'middle': len(author) > 2 and author[1] or '', 'last': len(author) > 2 and author[2] or (len(author) > 1 and author[1] or '') } if a not in authors: authors.append(a) if title not in titles: titles.append(title) title = ' ||| '.join(titles) genre = self.params['descr'].genre else: title = self.params['descr'].title authors = self.params['descr'].authors genre = self.params['descr'].genre annotations = [] for descr in descrs: if descr.get('annot', ''): annotations.append(descr.get('annot', '')) self.rez_descr.authors = authors self.rez_descr.title = title self.rez_descr.genre = genre auth_str = '' for author in authors: auth_str += "" auth_str += "%s" % author['first'] auth_str += "%s" % author['middle'] auth_str += "%s" % author['last'] auth_str += "\n" retv = '\n' #fill title-info retv += '\n' retv += '%s\n' % genre retv += auth_str retv += '%s\n' % title #if annotations: # retv+='%s\n' % '\n\n'.join(annotations) retv += '%s\n' % self.params['descr'].lang self.rez_descr.lang = self.params['descr'].lang retv += '\n' #fill document-info retv += '\n' retv += '\n' if self.params['descr'].program_info != None: retv += '%s\n' % self.params['descr'].program_info #retv += '%s\n' % (time.strftime('%Y-%m-%d'), time.strftime('%d %B %Y')) retv += '%s\n' % (time.strftime('%Y-%m-%d'), time.strftime('%Y-%m-%d')) if self.params['descr'].urls: retv += '%s\n' % ' '.join(self.params['descr'].urls) retv += '%s\n' % self.params['descr'].id retv += 
'1.0\n' retv += '\n' retv += '\n' return retv def make_notes(self): if not self.notes: return '' retv=['
<p>note %s</p>%s
' % (x,x,y) for x,y in self.notes] return '<p>Notes</p>'+''.join(retv)+'' # --- Auxiliary methods def tag_repr(self, tag, attrs, single=False): ''' Start tag representation ''' closer=single and '/' or '' if attrs: return "<%s %s%s>" % (tag, ' '.join(['%s="%s"' % x for x in attrs if x[1] is not None]),closer) else: return "<%s%s>" % (tag, closer) def clean_str(self, intext): ''' Remove simple tags from line. ''' return _RE_TAG.sub('',intext) def is_roman_number(self, instr): ''' Detect - is instr is roman number ''' instr = self.clean_str(instr).strip() if len(instr)>8: return False return bool(_RE_ROMAN.match(instr)) def msg(self, msg, level=0): if self.informer and self.params['verbose'] > level: self.informer(msg) def make_id(self, id): ''' Make properly link id ''' # FIX: make id later return id def print_out(self, data=None): if data is None: data = self.out for i in data: if type(i) == ListType: print '[' for j in i: print j.encode('koi8-r','replace') print ']' else: print i.encode('koi8-r','replace') def convert_image(self, filename): src = os.path.join(self.source_directoryname, filename) try: im = Image.open(src) except IOError: return None if im.format == 'GIF': f = cStringIO.StringIO() im.save(f, "PNG") return {'data':f.getvalue(), 'type':'png'} elif im.format == 'PNG': try: data = open(src, 'rb').read() except IOError: return None else: return {'data':data, 'type':'png'} elif im.format == 'JPEG': try: data = open(src, 'rb').read() except IOError: return None else: return {'data':data, 'type':'jpeg'} return None def get_descr(self): return self.rez_descr # FIXME TODO, the try block is broken locale.setlocale(locale.LC_ALL, '') try: sys_encoding = locale.nl_langinfo(locale.CODESET) except AttributeError: sys_encoding = "Windows-1251" default_params = { 'file-name' : '', # Input HTML file name, even if not reading from an on-disk file, this should be passed in to aid in href detection 'data' : '', # Data for processing 'encoding-from' : '', # Source data 
encoding 'encoding-to' : 'Windows-1251', # Result data encoding 'convert-quotes' : 1, # Convert "" to << >> 'convert-hyphen' : 1, # Convert - to ndash 'header-re' : '', # regexp for detecting section headers 'skip-images' : 0, # Ignore images (not include it to result) 'skip-ext-links' : 0, # Ignore external links 'skip-empty-lines' : 1, # Not generate tags 'detect-italic' : 1, # Detect italc (_italic text here_) 'detect-headers' : 1, # Detect sections headers 'detect-epigraphs' : 1, # Detect epigraphs 'detect-paragraphs' : 1, # Detect paragraphs 'detect-annot' : 1, # Detect annotation 'detect-verses' : 1, # Detect verses 'detect-notes' : 1, # Detect notes ([note here] or {note here}) 'verbose' : 1, # Verbose level 'sys-encoding': sys_encoding, 'informer': sys.stderr.write, 'convert-span-to': None, # what to convert span tags to, if set to 'em' or 'emphasis' converts spans to 'emphasis', if 'strong' converts to 'strong', anything else is ignored/skipped/removed (silently) 'descr': None }