#!/usr/bin/python2.3
# -*- coding: UTF-8 -*-
"""
HTML to FictionBook converter.
"""
"""
Chris TODO
* output encoding: only really works if the output encoding is the same as
the system encoding (e.g. ASCII on the Windows command line, UTF-8 on most
out-of-the-box Linux systems). This is due to the use of str values constructed
on the fly in the encoding specified on the command line, followed by implicit
conversion from str to Unicode (joining str to Unicode needs an explicit decode)
* uriparse lib for proper URL parsing
* images
* check support for links to images (as well as alterative text display/processing)
* handle images with url encoding, e.g. "my%20pic.jpg" versus "my pic.jpg" see url/uri above..
* add a check "if image not (png or jpeg) convert to PNG"? Most readers seem to only support png and jpeg
* check/change encoding of source to be (7 bit) ASCII, make this the default (instead if Windows-1252)
* remove getopt, use: optparse, (my modified) wxoptparse or my document optparse
* "em dash" is only handled for the HTML named entity ("&mdash;");
the numeric character reference form is not handled at all,
and the Unicode/XML literal form is displayed as the text of the reference!
Probably due to the use of the SGML parser, which isn't that flexible
* test multi document html input
* test href properly
* href bug if --not-convert-quotes is not issued, get ">>" in href!!
* -i bug not working properly if file does not exist (uses stdin) - Low priority as file logic removed from class, html2fb does NOT suffer from this
* -o bug not working properly if file is in use, get no error/exception displayed and instead got to stdout - Low priority as file logic removed from class, html2fb does NOT suffer from this
* TODO check python coding style/guide for open() versus file() -- http://mail.python.org/pipermail/python-dev/2004-July/045928.html
* pep8 where appropriate (note use 120 cols not 80)
* pychecker/pylint
* process() open() has except without type, should have type and not ignore everything
* convert_to_fb() out_file=file() also has except without type, should have type and not ignore everything
* convert2png has try/except with no restrictions, errors are lost.
* stuff before first header is book description: on/off
* remove SGMLParser? replace with Beautiful Soup http://www.crummy.com/software/BeautifulSoup/
* chardet support - http://chardet.feedparser.org/
* support of non-ASCII characters (e.g. Unicode) like "...", mdash, etc., open/close quotes; check HaliReader on PocketPC (suspect missing Unicode font) and import my mapping code
"""
from sgmllib import SGMLParser
import sys
from types import DictType, TupleType, ListType
import re
import tempfile
import os
import base64
import time
import urllib
import locale
import shutil
import cStringIO
import fb_utils
import time
import codecs
# Converter version; reported in the start-up message printed by process().
version='0.1.1'
# Module wide values
# Characters tested against the last character of a line by the header and
# paragraph detectors.
_SENTENCE_FIN = u'\'".!?:\xBB\u2026' # \xBB == >>, \u2026 == ...
# Characters tested against the first character of a candidate header line.
_HEAD_CHARS = u'0123456789VXILM*@'
# Dash characters tested against the first character of a line.
_DIAL_START = u'-\u2013\u2014' # MINUS, N DASH, M DASH
# Characters tested against the second character of a line that starts with a digit.
_DIAL_START2 = u')-\u2013\u2014 .;'
# Placeholders substituted for '&', '<' and '>' in character data by handle_data().
_CH_REPL_AMP = u'\x00'
_CH_REPL_LT = u'\x01'
_CH_REPL_GT = u'\x02'
# Replacement templates used with the quote regexes below (\xab/\xbb are the
# left/right guillemets; \\1 re-inserts the captured neighbouring text).
_CH_LEFT_Q = u'\\1\xab'
_CH_RIGHT_Q = u'\xbb\\1'
# Prefix prepended to paragraph text when it is appended to the output.
_CH_FLOW = u' '
_CH_DOTS = u'\u2026'
_CH_TIRE = u'\u2013'
# A double quote preceded by space/paren/semicolon/dash (opening quote).
_RE_LQUOTES = re.compile('([ (;-])"')
# A double quote followed by space or punctuation (closing quote).
_RE_RQUOTES = re.compile('"([ <&.,;?!)-])')
# A double quote at line start / line end, allowing adjacent tags.
_RE_LQUOTES2 = re.compile('^((?:<[^>]*>)*)"',re.M)
_RE_RQUOTES2 = re.compile('"((?:<[^>]*>)*)$',re.M)
# Any single tag; used by clean_str() to strip markup from a line.
_RE_TAG = re.compile('<[^>]*>')
# Case-insensitive Roman numeral; note that it also matches the empty string.
_RE_ROMAN = re.compile('^m?m?m?(c[md]|d?c{0,3})(x[lc]|l?x{0,3})(i[xv]|v?i{0,3})$', re.I)
# NOTE(review): this pattern has an empty capture group and looks as if markup
# text was lost from the literal -- confirm against the original source.
_RE_EL = re.compile('\s*()')
# Flags
_TAG_SKIP = 0x0001
_TAG_STRONG = 0x0004
_TAG_EM = 0x0008
_TAG_NOTSKIP = 0x0010
_TAG_ENDP = 0x0020
_TAG_STARTP = 0x0002
_TAG_PRE = 0x0040
_TAG_HEADER = 0x0080
_TAG_INP = 0x0100
_TAG_ID = 0x0200
_TAGS={
'a' : _TAG_INP,
'abbr' : 0,
'acronym' : 0,
'address' : 0,
'align' : 0,
'applet' : 0,
'area' : 0,
'b' : _TAG_STRONG,
'base' : 0,
'basefont' : 0,
'bdo' : 0,
'bgsound' : 0,
'big' : 0,
'blink' : 0,
'blockquote' : 0,
'body' : 0,
'br' : _TAG_STARTP|_TAG_ENDP,
'button' : 0,
'caption' : 0,
'center' : 0,
'cite' : _TAG_EM|_TAG_ID,
'code' : 0,
'col' : 0,
'colgroup' : 0,
'comment' : 0,
'dd' : 0,
'del' : 0,
'dfn' : 0,
'dir' : 0,
'div' : 0,
'dl' : 0,
'dt' : 0,
'em' : _TAG_EM,
'embed' : 0,
'fieldset' : 0,
'font' : 0,
'form' : _TAG_SKIP,
'frame' : 0,
'frameset' : 0,
'h1' : _TAG_HEADER,
'h2' : _TAG_HEADER,
'h3' : _TAG_HEADER,
'h4' : _TAG_HEADER,
'h5' : _TAG_HEADER,
'h6' : _TAG_HEADER,
'head' : _TAG_SKIP,
'hr' : _TAG_ENDP,
'html' : 0,
'i' : _TAG_EM,
'iframe' : 0,
'ilayer' : 0,
'img' : _TAG_ENDP,
'input' : 0,
'ins' : 0,
'isindex' : 0,
'kbd' : 0,
'keygen' : 0,
'label' : 0,
'layer' : 0,
'legend' : 0,
'li' : 0,
'link' : 0,
'listing' : _TAG_PRE,
'map' : 0,
'marquee' : 0,
'menu' : 0,
'meta' : 0,
'multicol' : 0,
'nextid' : 0,
'nobr' : 0,
'noembed' : _TAG_SKIP,
'noframes' : _TAG_SKIP,
'nolayer' : 0,
'nosave' : 0,
'noscript' : _TAG_SKIP,
'object' : 0,
'ol' : 0,
'optgroup' : 0,
'option' : 0,
'p' : _TAG_STARTP|_TAG_ENDP,
'param' : 0,
'plaintext' : _TAG_PRE,
'pre' : 0,
'q' : 0,
'rb' : 0,
'rbc' : 0,
'rp' : 0,
'rt' : 0,
'rtc' : 0,
'ruby' : 0,
's' : 0,
'samp' : 0,
'script' : _TAG_SKIP,
'select' : 0,
'server' : 0,
'servlet' : 0,
'small' : 0,
'spacer' : 0,
'span' : 0,
'strike' : 0,
'strong' : _TAG_STRONG|_TAG_INP,
'style' : _TAG_SKIP,
'sub' : 0,
'sup' : 0,
'table' : 0,
'tbody' : 0,
'td' : 0,
'textarea' : 0,
'tfoot' : 0,
'th' : 0,
'thead' : 0,
'title' : _TAG_NOTSKIP,
'tr' : _TAG_STARTP,
'tt' : 0,
'u' : 0,
'ul' : 0,
'var' : _TAG_EM,
'wbr' : 0,
'xmp' : _TAG_PRE,
#fb2 tags. ignored while parsing
'emphasis' : _TAG_INP,
'section' : _TAG_ID,
'poem' : _TAG_ID,
'epigraph' : _TAG_ID,
}
from PIL import Image
class binary(object):
def __init__(self):
self.f = os.tmpfile()
self.ids = []
def get(self):
self.f.seek(0)
return self.f
def add(self, type, id, data):
if id not in self.ids:
self.f.write('' % (type, id))
self.f.write(base64.encodestring(data))
self.f.write('\n')
self.ids.append(id)
class MyHTMLParser(SGMLParser):
"""HTML parser that accumulates converter output for FictionBook.

Originated from standard htmllib.
"""
# Build the class-level entity table from the standard name -> code point
# mapping, storing each entity as a unicode character.
from htmlentitydefs import name2codepoint
entitydefs={}
for (name, codepoint) in name2codepoint.iteritems():
entitydefs[name] = unichr(codepoint)
# Remove the loop helpers so they do not remain as class attributes.
del name, codepoint, name2codepoint
# 'nbsp' is overridden after the table is built.
entitydefs['nbsp']=u' '
def reset(self):
SGMLParser.reset(self)
self.nofill = 1 # PRE active or not
self.oldnofill = 0 # for saving nofill flag (for section title, for example)
self.out = [] # Result
self.data = '' # Currently parsed text data
self.skip = '' # Skip all all between tags. End tag here
self.nstack = [[],[]] # Stack for nesting tags control. first el. is tags stack, second - correspond. attrs
self.save = '' # Storage for data between tags pair
self.saving = False # Saving in progress flag
self.ishtml = False # data type
self.asline = (0,0,0) # [counted lines, > 80, < 80]
self.ids = {} # links ids
self.nextid=1 # next note id
self.notes=[] # notes
self.descr={} # description
self.informer=None # informer (for out messages)
def handle_charref(self, name):
"""Handle decimal escaped character reference, does not handle hex.
E.g. “Quoted.” Fred’s car."""
# Modified version of Python 2.3 SGMLParser class
# to fix , etc. (named escape) ascii decode error
# as well as “, etc. (decimal escape) ascii decode error
try:
n = int(name)
except ValueError:
self.unknown_charref(name)
return
self.handle_data(unichr(n))
def process(self, params):
'''Main processing method.

Parses every file listed in params['source_file_path'], runs the
detection and post-processing passes on each, and writes the combined
document to params['file_out'].  Returns True on completion, or '' as
soon as one source file turns out to be empty.
'''
self.params=params
self._TAGS = _TAGS
self.rez_descr = fb_utils.description()
self.binary = binary()
if 'informer' in params:
self.informer=params['informer']
# NOTE(review): the statement inside this first branch is a comparison
# ('=='), not an assignment, so 'emphasis' is never normalised to 'em';
# span then receives the flags of the 'emphasis' entry of _TAGS below.
if self.params['convert-span-to'] == 'emphasis' or self.params['convert-span-to'] == 'em':
self.params['convert-span-to'] == 'em'
elif self.params['convert-span-to'] != 'strong':
self.params['convert-span-to'] = None
# NOTE(review): self._TAGS is the module-level _TAGS dict itself, so this
# assignment changes the table for every parser in the process.
if self.params['convert-span-to'] is not None:
self._TAGS['span'] = self._TAGS[self.params['convert-span-to']]
secs = time.time()
self.msg('HTML to FictionBook converter, ver. %s\n' % version)
self.msg("Reading data...\n")
##
## use basename for href finding, could change regex instead?
#self.msg('process:'+unicode(params['file-name'], params['sys-encoding']))
##self.href_re = re.compile(".*?%s#(.*)" % unicode(params['file-name'], params['sys-encoding']))
# NOTE(review): the file name is inserted into the pattern without re.escape.
self.href_re = re.compile(".*?%s#(.*)" % unicode(os.path.basename(params['file-name']), params['sys-encoding']))
self.source_directoryname = unicode(os.path.dirname(params['file-name']), params['sys-encoding'])
##
# NOTE(review): the bare except hides every error, including an invalid
# user-supplied regular expression; header detection is silently disabled.
try:
self.header_re = params['header-re'].strip() and re.compile(params['header-re'])
except:
self.header_re = None
outs = []
descrs = []
for source_file_path in params['source_file_path']:
self.reset()
self.source_file_path = source_file_path
# NOTE(review): input is always decoded as UTF-8 here and the file object
# is never closed explicitly; params['encoding-from'] is not consulted.
data = codecs.open(source_file_path, 'r', 'utf-8').read()
# NOTE(review): an empty file aborts the whole run, discarding earlier files.
if not data:
return ''
self.msg('Preprocessing...\n')
data = self.pre_process(data)
self.msg('Parsing...\n')
# NOTE(review): the string literal below is split across two lines and
# appears to have lost markup -- confirm against the original source.
self.feed(data+'
')
self.close()
self.msg('Formatting...\n')
self.detect_epigraphs()
self.detect_verses()
self.detect_paragraphs()
self.msg('Postprocessing...\n')
self.post_process()
self.msg('Building result document...\n')
outs.append(self.out)
descrs.append(self.descr)
# NOTE(review): the literals used to assemble the document (and the regex
# in re.sub) appear to have lost their markup; as written the first '%'
# has no placeholder to fill -- confirm against the original source.
out = ('\n' \
'\n' % \
self.params['encoding-to'] + \
self.make_description(descrs) + \
'\n%s\n\n' % ''.join(outs) + \
self.make_notes())
out = re.sub(r"(?sm)(\s*\s*)", r"", out)
# Characters not representable in the target encoding become &#NNN; references.
out = out.encode(self.params['encoding-to'],'xmlcharrefreplace')
params['file_out'].write(out)
# Append the collected binary (image) elements after the document text.
shutil.copyfileobj(self.binary.get(), params['file_out'])
#params['file_out'].write(self.binary.get())
params['file_out'].write('')
self.msg("Total process time is %.2f secs\n" % (time.time() - secs))
return True
# --- Tag handling, need for parsing
def unknown_starttag(self, tag, attrs):
'''
Handle unknown start ttag
'''
if tag in self._TAGS or self.skip:
self.handle_starttag(tag, None, attrs)
else:
self.handle_data(self.tag_repr(tag, attrs))
def unknown_endtag(self, tag):
'''
Handle unknown end ttag
'''
if tag in self._TAGS or self.skip:
self.handle_endtag(tag, None)
else:
self.handle_data("%s>" % tag)
def handle_data(self, data):
    '''
    Accumulate character data.

    '&', '<' and '>' in the text are swapped for private placeholder
    characters; the text then goes to the save buffer while saving is
    active, otherwise to the current paragraph data.
    '''
    for raw, placeholder in (('&', _CH_REPL_AMP),
                             ('<', _CH_REPL_LT),
                             ('>', _CH_REPL_GT)):
        data = data.replace(raw, placeholder)
    if self.saving:
        self.save += data
    else:
        self.data += data
def handle_starttag(self, tag, method, attrs):
    '''
    Handle all start tags.

    tag    -- lower-case tag name
    method -- handler chosen by the SGML parser, or a false value to pick
              one from the tag's flags
    attrs  -- list of (name, value) attribute pairs
    '''
    # Only a missing table entry is expected here; a bare except would
    # also hide genuine programming errors.
    try:
        flag = self._TAGS[tag]
    except KeyError:
        flag = 0
    # While skipping, only tags flagged _TAG_NOTSKIP are still processed.
    if self.skip and not flag & _TAG_NOTSKIP:
        return
    if flag & _TAG_SKIP:
        self.skip = tag
    if not method:
        # Later entries win when a tag carries several of these flags.
        for bit, handler in ((_TAG_EM, self.start_em),
                             (_TAG_STRONG, self.start_strong),
                             (_TAG_PRE, self.start_pre),
                             (_TAG_STARTP, self.do_p),
                             (_TAG_HEADER, self.start_h1)):
            if flag & bit:
                method = handler
    if method:
        method(attrs)
    # if detected tag, but text still non-html - set text as html
    if not self.ishtml and \
            not flag & (_TAG_EM | _TAG_STRONG | _TAG_INP) and \
            tag != 'h6':
        self.end_paragraph()
        self.ishtml = True
        self.nofill = 0
def handle_endtag(self, tag, method):
    '''
    Handle all end tags.

    tag    -- lower-case tag name
    method -- handler chosen by the SGML parser, or a false value to pick
              one from the tag's flags
    '''
    # Only a missing table entry is expected here; a bare except would
    # also hide genuine programming errors.
    try:
        flag = self._TAGS[tag]
    except KeyError:
        flag = 0
    # The end tag that started a skip ends it and drops the skipped text.
    if self.skip and self.skip == tag:
        self.skip = ''
        self.data = ''
        return
    if not method:
        # Later entries win when a tag carries several of these flags.
        for bit, handler in ((_TAG_EM, self.end_em),
                             (_TAG_STRONG, self.end_strong),
                             (_TAG_PRE, self.end_pre),
                             (_TAG_ENDP, self.end_paragraph),
                             (_TAG_HEADER, self.end_h1)):
            if flag & bit:
                method = handler
    if method:
        method()
def start_title(self, attrs):
''' Save document title - start'''
self.start_saving()
def end_title(self):
''' End saving document title '''
self.descr['title'] = ' '.join(self.stop_saving().split()).strip()
def do_meta(self, attrs):
'''
Handle meta tags - try get document author
'''
name=''
content=''
for opt, val in attrs:
if opt=='name':
name=val
elif opt=='content':
content=val.strip()
if name=='author' and content:
self.descr['author']=content
def do_p(self, attrs):
'''Handle tag P'''
self.end_paragraph()
self.mark_start_tag('p')
def start_pre(self, attrs):
''' Handle tag PRE '''
self.nofill = self.nofill + 1
self.do_p(None)
def end_pre(self):
''' Handle tag /PRE '''
self.end_paragraph()
self.nofill = max(0, self.nofill - 1)
def start_em(self, attrs):
''' Handle tag EM '''
self.mark_start_tag('emphasis')
def end_em(self):
''' Handle tag /EM '''
self.mark_end_tag('emphasis')
def start_strong(self, attrs):
''' Handle tag STRONG '''
self.mark_start_tag('strong')
def end_strong(self):
''' Handle tag /STRONG '''
self.mark_end_tag('strong')
def start_a(self, attrs):
''' Handle tag A: track link ids and open an 'a' element for href links. '''
# self.ids maps an id to a two-item list: slot 0 is incremented for every
# 'name' anchor with that id, slot 1 for every 'href' that points at it.
for attrname, value in attrs:
value = value.strip()
if attrname == 'href':
# href_re (compiled in process()) matches links into the source file itself;
# group 1 is the fragment after '#'.
res = self.href_re.match(value)
if res:
value=self.make_id(res.group(1))
# NOTE(review): bare except; a KeyError for a first-seen id is the expected case.
try:
self.ids[value][1]+=1
except:
self.ids[value]=[0,1]
value="#"+value
# NOTE(review): this returns from the whole method, so attributes that
# follow a skipped external href (e.g. 'name') are not processed.
if self.params['skip-ext-links'] and not res:
return
self.mark_start_tag('a', [('xlink:href',value)])
if attrname == 'name':
value = self.make_id(value)
# NOTE(review): the format string below is empty, so '%' raises TypeError;
# the anchor marker text appears to have been lost -- confirm against the
# original source.
self.data+="" % value
try:
self.ids[value][0]+=1
except:
self.ids[value]=[1,0]
def end_a(self):
''' Handle tag /A '''
self.mark_end_tag('a')
def start_h1(self, attrs):
''' Handle tag H1-H6: close the current paragraph and start a heading. '''
self.end_paragraph()
# NOTE(review): the three empty strings look like markup literals whose
# text was lost -- confirm against the original source.
self.out.extend(['','',''])
self.mark_start_tag('p')
# Remember the fill mode and switch to normal (non-preformatted) text
# for the heading; end_h1 restores it.
self.oldnofill, self.nofill = self.nofill, 0
def end_h1(self):
''' Handle tag /H1-/H6: finish the heading and restore the fill mode. '''
self.end_paragraph()
# NOTE(review): the empty string looks like a markup literal whose text
# was lost -- confirm against the original source.
self.out.append('')
# Restore the fill mode saved by start_h1.
self.nofill = self.oldnofill
self.mark_start_tag('p')
def do_img(self, attrs):
''' Handle images '''
if self.params['skip-images']:
return
src = None
for attrname, value in attrs:
if attrname == 'src':
src = value
if src:
temp_image_filename= os.path.join(os.path.split(self.source_file_path)[0], urllib.unquote(src))
img = self.convert_image(temp_image_filename)#src.encode(self.params['sys-encoding']))
if img:
self.end_paragraph()
src=os.path.basename(src)
self.out.append(self.tag_repr('image', [('xlink:href','#'+src)], True))
self.binary.add('image/%s' % img['type'], src, img['data'])
def report_unbalanced(self, tag):
''' Handle unbalansed close tags'''
self.handle_data('%s>\n' % tag)
def unknown_charref(self, ref):
''' Handle unknown char refs '''
# FIX: Don't know, how to handle it
self.msg('Unknown/invalid char ref %s is being ignored\n' % ref)
raise(Warning, 'Unknown char ref %s is being ignored\n' % ref)
def unknown_entityref(self, ref):
''' Handle unknown entity refs '''
# FIX: Don't know, how to handle it
self.msg('Unknown entity ref %s\n' % ref, 1)
# --- Methods for support parsing
def start_saving(self):
''' Not out data to out but save it '''
self.saving = True
self.save = ''
def stop_saving(self):
''' Stop data saving '''
self.saving = False
return self.save
def end_paragraph(self):
'''
Finalise the current paragraph.

With no pending text an open 'p' is either removed (or replaced,
depending on params['skip-empty-lines']) or closed; with pending text
a 'p' is opened if necessary and then closed.
'''
# NOTE(review): several string literals in this method are empty or split
# across lines and appear to have lost markup -- confirm against the
# original source before relying on the exact comparisons below.
if not self.data.strip():
try:
# index() raises ValueError when no 'p' is open; handled at the bottom.
p = self.nstack[0].index('p')
if self.out[-1] == '' or not self.out[-1]:
if self.params['skip-empty-lines']:
self.out.pop()
else:
self.out[-1] = ""
# Drop 'p' and everything opened after it from both stacks.
self.nstack[0]=self.nstack[0][:p]
self.nstack[1]=self.nstack[1][:p]
else:
self.mark_end_tag('p')
except ValueError:
pass
else:
if 'p' not in self.nstack[0]:
# Register an implicit 'p' at the bottom of the nesting stacks.
self.nstack[0][0:0]='p'
self.nstack[1][0:0]=[None]
self.out.append('
')
self.mark_end_tag('p')
def mark_start_tag(self, tag, attrs=None):
    '''
    Remember an opened tag and write its start tag.

    tag   -- tag name to open; an already open tag of the same name is
             closed first
    attrs -- optional list of (name, value) pairs for the start tag

    Inline tags (_TAG_INP) are written into the current text data, all
    others are appended to the output list.
    '''
    # Only a missing table entry is expected here; a bare except would
    # also hide genuine programming errors.
    try:
        flag = self._TAGS[tag]
    except KeyError:
        flag = 0
    if tag in self.nstack[0]:
        self.mark_end_tag(tag)
    self.nstack[0].append(tag)
    self.nstack[1].append(attrs)
    start = self.tag_repr(tag, attrs)
    if flag & _TAG_INP:
        self.data += start
    else:
        self.out.append(start)
def mark_end_tag(self, tag):
    '''
    Close ``tag`` together with every tag that was opened after it.

    Tags are popped from the nesting stack until ``tag`` itself has been
    closed.  Does nothing when ``tag`` is not open.
    '''
    if tag not in self.nstack[0]:
        return
    while self.nstack[0]:
        v = self.nstack[0].pop()
        a = self.nstack[1].pop()
        # Only a missing table entry is expected here.
        try:
            flag = self._TAGS[v]
        except KeyError:
            flag = 0
        if flag & _TAG_INP:
            et = self.tag_repr(v, a)
            stripped = self.data.rstrip()
            if stripped.endswith(et):
                # The inline element is empty: remove its start tag
                # instead of emitting an empty element.
                self.data = stripped[:-len(et)]
                if v == 'a':
                    # Best effort: undo the reference count of the link.
                    # NOTE(review): start_a stores ids without the leading
                    # '#', while the attribute value here includes it, so
                    # this lookup may never succeed -- verify.
                    try:
                        self.ids[a[0][1]][1] -= 1
                    except (KeyError, IndexError, TypeError):
                        pass
            else:
                # The literal previously lacked the leading "</".
                self.data += "</%s>" % v
        else:
            self.process_data()
            if self.out and self.out[-1] == "<%s>" % v:
                ## duplicate/empty tag detected: the start tag is the last
                ## output item, so drop it instead of emitting an empty pair
                ## FIXME not really sure if this is 100% appropriate, is this ONLY for missing target links?
                self.msg("!!!!\n")
                self.msg('DEBUG** ' + repr(v) + ' --- ' + repr(self.out[-10:]) + ' ' + repr(v), 1)
                self.out.pop()
            else:
                self.out.append("</%s>" % v)
        if tag == v:
            break
def process_data(self):
'''
Move the accumulated text data to the output when a paragraph is closed.

In fill mode the stripped text is appended as a single item; otherwise
the text is split into lines and passed through header detection, and
the resulting items are appended.
'''
if not self.data.strip():
return
if not self.nofill:
self.data=_CH_FLOW+self.data.strip()
self.out.append(self.data)
else:
# process_pre returns a list of lines; detect_headers returns a list of
# output items built from those lines.
self.data = self.process_pre(self.data)
self.data = self.detect_headers(self.data)
# NOTE(review): the comparison literals below are empty or split across
# lines and appear to have lost markup.  The bare name 'msg' is not
# defined in this file (self.msg was probably meant), so this branch
# would raise NameError if it ever executed.
try:
if self.data[0]=='
' and self.out[-1]=='':
self.out.pop()
self.data = self.data[1:]
msg("WoW! I must be impossible!!!")
except IndexError:
pass
self.out.extend(self.data)
self.data = ''
# --- Parsed data processing methods
def pre_process(self, data):
# Clean the raw input text before it is fed to the parser; returns the
# cleaned text.
# NOTE(review): the replacement literals for \x0e and \x0f are split across
# lines and appear to have lost markup -- confirm against the original source.
data=data.replace(u'\x0e',u'
').replace(u'\x0f',u'
')
# Replace the remaining control characters with spaces.  This includes
# \x01 and \x02, which handle_data() later uses as placeholders.
for i in u'\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x10\x11'\
'\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f':
data = data.replace(i,u' ')
return data
def post_process(self):
'''
Last processing pass over self.out.

Runs process_paragraph on every text item, attaches pending ids to
tags, and then turns the list into the final output string.
'''
# NOTE(review): from the join loop onwards this method is visibly damaged
# (statements are cut off mid-expression and several literals are empty);
# it has to be restored from the original source before it can run.
id = ''
for i in range(len(self.out)):
if not self.out[i]:
pass # skip empty lines (can apper below, where ...out[i+1]='')
elif self.out[i][0] != '<':
# Text item: normalise it and collect an id found inside it.
id, p = self.process_paragraph(self.out[i], id)
if p:
if id:
# Attach the id to the tag that precedes the text.
self.out[i-1] = '<%s id="%s">' % (self.out[i-1][1:-1],id)
id = ''
self.out[i] = p
else:
self.out[i-1]=self.out[i]=self.out[i+1]='' # remove empty paragraph
elif id:
# Tag item with a pending id: attach it if the tag is flagged _TAG_ID.
try:
if self._TAGS[self.out[i][1:-1]] & _TAG_ID:
self.out[i] = '<%s id="%s">' % (self.out[i][1:-1],id)
id = ''
except KeyError:
pass
#join with dont use newline in and tags
out_tmp = ''
for i in xrange(len(self.out)):
is_new_line = True
if i >= (len(self.out) -1 ):
is_new_line = False
elif self.out[i].startswith('=0]:
self.out=re.sub("%s(.*?)%s" % (self.tag_repr('a',[('xlink:href','#'+i)]),'') ,r'\1',self.out)
sect=self.out.find('').split())
self.out=self.out[sect:]
else:
if self.out.startswith(''):
self.out=self.out[len('
'):]
else:
self.out='\n'+self.out
self.out+='\n'
self.out=_RE_EL.sub(r'\1',self.out)
if self.out.startswith(''):
title = 'title' in self.descr and self.descr['title'] or ''
if title:
self.out = '' + '' + title + '' + self.out[len(''):]
def detect_headers(self, data):
'''
Find headers in plain text.

data is a list of text lines.  Returns a list whose items are either
slices of data (lists of lines) or the strings of the header template
with the title text filled in.  When params['detect-headers'] is off
the result is simply [data].
'''
if not self.params['detect-headers']:
return [data]
res = []
pstart = i = 0
# NOTE(review): the template entries are empty or split across lines and
# appear to have lost their markup; only slot 5 (the title text) is
# filled in by the code below.
header = ['',
'',
'',
'',
'',
'', # place for title (5)
'
',
'',
'']
while i < len(data)-1:
# emptyN is true when line i+N is empty; past the end of data the three
# look-ahead flags are all treated as "not empty".
empty0 = not data[i]
try:
empty1 = not data[i+1]
empty2 = not data[i+2]
empty3 = not data[i+3]
except IndexError:
empty1 = empty2 = empty3 = False
# Pattern: empty, empty, text, empty.
if empty0 and empty1 and not empty2 and empty3:
res.append(data[pstart:i])
# NOTE(review): the matched text is on line i+2, but line i+1 (empty in
# this branch) is used as the title -- verify.
header[5]=_CH_FLOW + data[i+1].strip()
res.extend(header)
i+=2
pstart = i+2
else:
# Pattern: empty, text, empty, plus at least one hint that the text
# line is a heading.
istitle = (
empty0 and
not empty1 and
empty2 and
(
empty3 or
data[i+1].startswith(' '*8) or
data[i+1].isupper() or
(
data[i+1].lstrip()[0] not in _DIAL_START and
data[i+1][-1] not in _SENTENCE_FIN
) or
data[i+1].lstrip()[0] in _HEAD_CHARS or
self.is_roman_number(data[i+1])
)
)
# A line matching the user supplied header regexp is always a title.
istitle = istitle or \
not empty1 and \
self.header_re and \
self.header_re.match(data[i+1])
if istitle:
res.append(data[pstart:i])
header[5]=_CH_FLOW + data[i+1].strip()
res.extend(header)
i+=1
# Skip the empty lines that follow the title.
while i < len(data)-1 and not data[i+1]:
i+=1
pstart = i+1
i+=1
if pstart < len(data):
res.append(data[pstart:])
return res
def detect_epigraphs(self):
'''
Detect epigraphs (in plain text).

Scans self.out for list items (blocks of raw lines) and replaces the
leading part of a block with epigraph output items when most of its
lines look like an epigraph.  Controlled by params['detect-epigraphs'].
'''
# NOTE(review): many string literals in this method are empty or split
# across lines and appear to have lost markup; the comparisons against ''
# and the emitted items cannot be trusted as written.
if not self.params['detect-epigraphs']:
return
sect_found = 0
i = 0
while i < len(self.out):
if type(self.out[i]) != ListType:
# String item: update the small state machine kept in sect_found.
if self.out[i] == '':
sect_found = 1
elif self.out[i] == '':
sect_found = sect_found and 2 or 0
elif self.out[i] == '':
sect_found = sect_found and 1 or 0
elif self.out[i][0] != '<':
sect_found = sect_found!=1 and 2 or 0
else:
# List item: a block of raw text lines.
if sect_found == 1:
res = []
raw = self.out[i]
lraw = len(raw)
j=0
eplines = 0
epfound = 0
while j < len(raw):
while j < lraw and not raw[j]:
j+=1 # skip empty lines
eep = -1
# search empty line
for k in range(j,j+60):
if k >= lraw or not raw[k]:
eep = k
break
if eep == j:
break
if eep >= 0:
# Score lines j..eep-1: short or deeply indented lines count for an
# epigraph, long lines and dialogue-like lines count against it.
eplines = 0
for k in range(j,eep):
rawk = raw[k].lstrip()
if ' '*10 in raw[k] or len(rawk) < 60:
eplines +=1
if len(rawk) > 60:
eplines -= 5
if rawk and (
rawk[0] in _DIAL_START or
rawk[0].isdigit() and
len(rawk)>2 and
rawk[1] in _DIAL_START2
):
eplines -= 1
if (float(eplines)/(eep-j)>0.8):
epfound += 1
# With more than one line the last line is treated as the author.
author = eep-j > 1
res.extend(['','',raw[j:eep-author]])
if author and self.clean_str(raw[eep-1]).lstrip()[0].isupper():
res.extend(['
','',_CH_FLOW + self.clean_str(raw[eep-1]).lstrip(),''])
else:
if author:
res[-1].append(raw[eep-1])
res.append('
')
res.append('')
j=eep
else:
eep = -1
if eep < 0:
break
j+=1
if epfound:
# Replace the processed block (and adjacent items) with the result.
istart=i
iend=i+1
if i and self.out[i-1] == '':
istart-=1
if j < len(raw)-1:
res.extend(['
',raw[j:]])
elif i < len(self.out)-1 and self.out[i+1] == '
':
iend+=1
self.out[istart:iend] = res
i = istart + len(res)-1
sect_found = 0
i += 1
def detect_verses(self):
'''
Detect verses in plain text.

Scans every list item of self.out (a block of raw lines) for runs of
more than three short lines of similar length and replaces them with
verse output items.  Controlled by params['detect-verses'].
'''
# NOTE(review): many string literals in this method are empty or split
# across lines and appear to have lost markup; the emitted items and the
# comparisons against '' cannot be trusted as written.
if not self.params['detect-verses']:
return
i=0
while i < len(self.out):
if type(self.out[i]) == ListType:
res=[]
# Work on tag-free, right-stripped copies of the lines.
raw=[self.clean_str(x).rstrip() for x in self.out[i]]
lraw=len(raw)
pfound = jstart = j = 0
while j < lraw-3:
# A candidate starts with four consecutive non-empty short lines.
if raw[j] and len(raw[j]) < 60 and \
raw[j+1] and len(raw[j+1]) < 80 and \
raw[j+2] and len(raw[j+2]) < 80 and \
raw[j+3] and len(raw[j+3]) < 80:
fl = len(raw[j])
k = j
# Extend the run until a line differs in length by more than 15
# characters from the first one or starts with a dash.
while k < lraw:
if raw[k].strip() and (
abs(len(raw[k])-fl) > 15 or \
raw[k].lstrip()[0] in _DIAL_START
):
break
k += 1
if k - j > 3:
pfound += 1
if jstart:
res.append('')
if jstart != j:
res.extend([self.out[i][jstart:j],'
'])
res.extend(['',''])
for l in range(j,k):
if raw[l]:
res.extend(['',raw[l].lstrip(),''])
elif l < k-1 and res[-1] != '':
res.extend(['',''])
res.extend(['',''])
j=k-1
jstart = k
j+=1
if pfound:
if jstart < lraw-1:
res.extend(['',self.out[i][jstart:]])
istart = i
iend = i+1
# NOTE(review): the bare excepts below presumably guard against IndexError
# at the ends of self.out/res, but hide any other error too.
try:
if res[0] == '' and self.out[i-1] == '':
istart -= 1
except:
pass
try:
if res[-1] == '
' and self.out[i+1] == '
':
iend += 1
except:
pass
self.out[istart:iend]=res
i+=1
def detect_paragraphs(self):
'''
Detect paragraphs in plain text.

Every list item of self.out (a block of raw lines) is replaced either by
paragraph output items or, when no paragraph break was found, by the
lines joined into a single string.
'''
# NOTE(review): several string literals in this method are empty or split
# across lines and appear to have lost markup.
i=0
while i < len(self.out):
if type(self.out[i]) == ListType:
res = []
raw = self.out[i]
j = 0
pfound = 0
# Skip leading empty lines.
while j < len(raw) and not raw[j]:
j+=1
jstart = j
while j < len(raw):
if not raw[j]:
# Run of empty lines; IndexError means the block ended inside the run.
try:
while not raw[j]:
j+=1
except IndexError:
break
if not self.params['skip-empty-lines']:
res.append('')
jstart=j
continue
# NOTE(review): self.asline is a non-empty tuple (always true) until
# process_pre() replaces it with a bool, so until then every line ends a
# paragraph regardless of the remaining conditions.
elif self.asline or \
not self.params['detect-paragraphs'] or \
j >= len(raw)-1 or \
not raw[j+1].lstrip() or \
(raw[j+1].lstrip()[0] in _DIAL_START or \
raw[j+1].startswith(' ')
) and raw[j][-1] in _SENTENCE_FIN:
pfound += 1
res.extend(['',_CH_FLOW + '\n'.join(raw[jstart:j+1]),'
'])
#res.extend(['',_CH_FLOW + ''.join(raw[jstart:j+1]),'
'])
jstart = j+1
j+=1
if pfound > 0:
# The first and last items of res are dropped when splicing it in.
self.out[i:i+1]=res[1:-1]
i+=len(res)-2
else:
self.out[i]='\n'.join(raw).lstrip()
i+=1
def detect_italic(self, text, arg):
# Convert _italic_ markers in text; arg is unused.  A marked span starts at
# an underscore and ends at the next character from 'signs' (another
# underscore or a punctuation mark).  Returns the rewritten text.
# NOTE(review): the literals wrapped around the span are empty and appear
# to have lost their markup -- confirm against the original source.
signs='_.,!?:'
istart=-1
res=''
while True:
istart = text.find('_')
if istart >= 0:
# iend: position of the nearest terminating sign after the underscore,
# or 0 when there is none.
iend = sys.maxint
for i in signs:
try:
iend=min(iend, text.index(i, istart+1))
except:
pass
if iend == sys.maxint:
iend=0
if text[istart+1:iend or None]:
emp = ''+ \
text[istart+1:iend or None]+ \
''
else:
emp = ''
# Re-assemble the text; a closing underscore is consumed, any other
# terminating sign is kept.
text = text[:istart]+ \
emp+ \
(iend and text[iend+(text[iend]=='_'):] or '')
else:
break
return text
def detect_notes(self, text, arg):
# Extract inline notes from text.  arg is a two-character string: the
# opening and closing delimiter (process_paragraph passes "[]" and "{}").
# Each note body is stored in self.notes with a running id and replaced in
# the text by a "note N" reference.  Returns the rewritten text.
# NOTE(review): the literals around the reference are empty and appear to
# have lost their markup; as written, '' % self.nextid raises TypeError.
while True:
snote = text.find(arg[0])
enote = text.find(arg[1])
# Stop when there is no opening delimiter or no closing one after it.
if snote <0 or enote <= snote:
break
self.notes.append((self.nextid, text[snote+1:enote]))
text=(text[:snote] +
'' % self.nextid +
"note %s" % self.nextid+"" +
text[enote+1:]
)
self.nextid += 1
return text
def process_pre(self, data):
'''
Process preformatted data (data between and
tag or plain text file)
Determine text format.
'''
data = [x.rstrip() for x in data.splitlines()]
if type(self.asline) == TupleType:
count,G80,L80 = self.asline
for i in data:
if len(i) > 80:
G80+=1
else:
L80+=1
count+=1
if count > 2000:
self.asline = G80 > L80
break
if type(self.asline) == TupleType:
self.asline=(count, G80, L80)
return data
def process_paragraph(self, paragraph, id):
'''
Process a paragraph: find an embedded id, normalise quotes, detect notes
and italics (each controlled by its params switch) and collapse
whitespace.

Returns [id, paragraph].
'''
paragraph=paragraph.strip()
# NOTE(review): the next statement is cut off mid-expression (the search
# text and the loop header appear to be lost), and the find() literal
# further down is empty -- restore from the original source.
startp = paragraph.find('= 0:
endp=paragraph.index('>',startp+4) # if '>' will not be found exception will raised, because use index
found_id = paragraph[startp+4:endp]
# The first id found is kept; for any further id the anchor count
# (slot 0 of its self.ids entry) is reset.
if not id:
id = found_id
else:
self.ids[found_id][0]=0
paragraph=paragraph[:startp]+paragraph[endp+1:]
startp = paragraph.find('')
paragraph=paragraph[:endp+1]+paragraph[endp+1:].lstrip()
if self.params['convert-quotes']:
# process quotes
paragraph = _RE_LQUOTES.sub(_CH_LEFT_Q, paragraph)
paragraph = _RE_RQUOTES.sub(_CH_RIGHT_Q, paragraph)
paragraph = _RE_LQUOTES2.sub(_CH_LEFT_Q, paragraph)
paragraph = _RE_RQUOTES2.sub(_CH_RIGHT_Q, paragraph)
if self.params['detect-notes']:
# Notes are recognised in both [square] and {curly} brackets.
paragraph = self.process_nontags(paragraph, self.detect_notes, "[]")
paragraph = self.process_nontags(paragraph, self.detect_notes, "{}")
if self.params['detect-italic']:
paragraph = self.process_nontags(paragraph, self.detect_italic, None)
paragraph = ' '.join(paragraph.split()) # Remove extra whitespaces
return [id, paragraph]
def process_nontags(self, text, func, arg):
ss=0
res = ''
w = ''
while 0 <= ss < len(text):
try:
i = text.index('<',ss)
w=text[ss:i]
ss = i
except:
w=text[ss:]
ss = -1
if w.strip():
# process text between tagtext if any.
res += func(w, arg)
else:
res += ' '
if ss >= 0:
i = text.index('>',ss)
res+=text[ss:i+1]
ss=i+1
return res
# --- Make out document methods
def make_description(self, descrs):
# Build the description part of the output document and return it as a
# string.  descrs is the list of per-file description dicts collected by
# process().  The chosen authors, title, genre and language are also
# stored on self.rez_descr.
# NOTE(review): nearly every literal that builds the result is empty or
# reduced to '%s\n' and appears to have lost its markup; some now have
# fewer placeholders than arguments (e.g. the date line), so '%' raises
# TypeError.  Restore from the original source.
if self.params['descr'].selfdetect:
# Self-detection: derive titles and authors from the parsed documents.
titles = []
authors = []
for descr in descrs:
title = 'title' in descr and descr['title'] or ''
author = 'author' in descr and descr['author'] or ''
# Without an explicit author, a title of the form "Author. Title" is
# split at its first dot.
if not author and '.' in title :
point = title.index('.')
author = title[:point].strip()
title = title[point+1:].strip()
author = author.split()
# One word: first name only; two words: first and last; three or more:
# first, middle and last (taken from the third word).
a ={
'first': author and author[0] or '',
'middle': len(author) > 2 and author[1] or '',
'last': len(author) > 2 and author[2] or (len(author) > 1 and author[1] or '')
}
if a not in authors:
authors.append(a)
if title not in titles:
titles.append(title)
title = ' ||| '.join(titles)
genre = self.params['descr'].genre
else:
title = self.params['descr'].title
authors = self.params['descr'].authors
genre = self.params['descr'].genre
# Annotations are collected but only used by the commented-out code below.
annotations = []
for descr in descrs:
if descr.get('annot', ''):
annotations.append(descr.get('annot', ''))
self.rez_descr.authors = authors
self.rez_descr.title = title
self.rez_descr.genre = genre
auth_str = ''
for author in authors:
auth_str += ""
auth_str += "%s" % author['first']
auth_str += "%s" % author['middle']
auth_str += "%s" % author['last']
auth_str += "\n"
retv = '\n'
#fill title-info
retv += '\n'
retv += '%s\n' % genre
retv += auth_str
retv += '%s\n' % title
#if annotations:
# retv+='%s\n' % '\n\n'.join(annotations)
retv += '%s\n' % self.params['descr'].lang
self.rez_descr.lang = self.params['descr'].lang
retv += '\n'
#fill document-info
retv += '\n'
retv += '\n'
if self.params['descr'].program_info != None:
retv += '%s\n' % self.params['descr'].program_info
#retv += '%s\n' % (time.strftime('%Y-%m-%d'), time.strftime('%d %B %Y'))
retv += '%s\n' % (time.strftime('%Y-%m-%d'), time.strftime('%Y-%m-%d'))
if self.params['descr'].urls:
retv += '%s\n' % ' '.join(self.params['descr'].urls)
retv += '%s\n' % self.params['descr'].id
retv += '1.0\n'
retv += '\n'
retv += '\n'
return retv
def make_notes(self):
# Build the notes part of the output document from self.notes, a list of
# (id, text) pairs filled by detect_notes(); returns '' when there are none.
# NOTE(review): the per-note format string is empty although three values
# are supplied (TypeError as written), and the final literal is split
# across lines; both appear to have lost markup -- restore from the
# original source.
if not self.notes:
return ''
retv=['' %
(x,x,y) for x,y in self.notes]
return 'Notes
'+''.join(retv)+''
# --- Auxiliary methods
def tag_repr(self, tag, attrs, single=False):
''' Start tag representation '''
closer=single and '/' or ''
if attrs:
return "<%s %s%s>" % (tag, ' '.join(['%s="%s"' % x for x in attrs if x[1] is not None]),closer)
else:
return "<%s%s>" % (tag, closer)
def clean_str(self, intext):
    ''' Return intext with everything that looks like a tag (<...>) removed. '''
    without_tags = _RE_TAG.sub('', intext)
    return without_tags
def is_roman_number(self, instr):
    '''
    Tell whether instr, after tags and surrounding whitespace are
    removed, is a Roman numeral of at most 8 characters (case-insensitive).
    '''
    instr = self.clean_str(instr).strip()
    # The pattern consists only of optional parts and therefore also
    # matches the empty string; a blank or tag-only line is not a number.
    if not instr or len(instr) > 8:
        return False
    return bool(_RE_ROMAN.match(instr))
def msg(self, msg, level=0):
if self.informer and self.params['verbose'] > level:
self.informer(msg)
def make_id(self, id):
'''
Make properly link id
'''
# FIX: make id later
return id
def print_out(self, data=None):
if data is None:
data = self.out
for i in data:
if type(i) == ListType:
print '['
for j in i:
print j.encode('koi8-r','replace')
print ']'
else:
print i.encode('koi8-r','replace')
def convert_image(self, filename):
    '''
    Load an image for embedding.

    filename is joined onto the source directory.  GIF images are
    re-encoded as PNG; PNG and JPEG files are passed through unchanged.
    Returns a dict {'data': <file content>, 'type': 'png' or 'jpeg'}, or
    None when the file cannot be read or has any other format.
    '''
    src = os.path.join(self.source_directoryname, filename)
    try:
        im = Image.open(src)
    except IOError:
        return None
    if im.format == 'GIF':
        f = cStringIO.StringIO()
        im.save(f, "PNG")
        return {'data': f.getvalue(), 'type': 'png'}
    # Formats that are embedded as they are, mapped to their type name.
    kind = {'PNG': 'png', 'JPEG': 'jpeg'}.get(im.format)
    if kind is None:
        return None
    try:
        fh = open(src, 'rb')
    except IOError:
        return None
    # Nested try blocks keep this valid on old Python 2 releases while
    # making sure the handle is always closed.
    try:
        try:
            data = fh.read()
        except IOError:
            return None
    finally:
        fh.close()
    return {'data': data, 'type': kind}
def get_descr(self):
return self.rez_descr
# Determine the system encoding.  Selecting the user's locale can fail with
# locale.Error (e.g. an unsupported LANG value); that must not prevent the
# module from being imported, so the current locale is kept in that case.
try:
    locale.setlocale(locale.LC_ALL, '')
except locale.Error:
    pass
try:
    sys_encoding = locale.nl_langinfo(locale.CODESET)
except AttributeError:
    # nl_langinfo is not available on every platform.
    sys_encoding = "Windows-1251"
# Default conversion parameters.
default_params = {
    'file-name' : '', # Input HTML file name, even if not reading from an on-disk file, this should be passed in to aid in href detection
    'data' : '', # Data for processing
    'encoding-from' : '', # Source data encoding
    'encoding-to' : 'Windows-1251', # Result data encoding
    'convert-quotes' : 1, # Convert "" to << >>
    'convert-hyphen' : 1, # Convert - to ndash
    'header-re' : '', # regexp for detecting section headers
    'skip-images' : 0, # Ignore images (do not include them in the result)
    'skip-ext-links' : 0, # Ignore external links
    'skip-empty-lines' : 1, # Do not generate empty-line tags
    'detect-italic' : 1, # Detect italic (_italic text here_)
    'detect-headers' : 1, # Detect sections headers
    'detect-epigraphs' : 1, # Detect epigraphs
    'detect-paragraphs' : 1, # Detect paragraphs
    'detect-annot' : 1, # Detect annotation
    'detect-verses' : 1, # Detect verses
    'detect-notes' : 1, # Detect notes ([note here] or {note here})
    'verbose' : 1, # Verbose level
    'sys-encoding': sys_encoding,
    'informer': sys.stderr.write,
    'convert-span-to': None, # what to convert span tags to, if set to 'em' or 'emphasis' converts spans to 'emphasis', if 'strong' converts to 'strong', anything else is ignored/skipped/removed (silently)
    'descr': None
}