#!/usr/bin/env python3
# -*- coding: latin-1 -*-
"""
entities.py
  Transform the file 'http://www.w3.org/TR/html401/sgml/entities.html'
  read from standard input to an HTML table on standard output.

GPLv3 --- Copyright (C) 2018 Olivier Pirson
http://www.opimedia.be/

Started the October 6, 2007
v.01.00 --- October 6, 2007
        --- February 20, 2008
        --- May 19, 2008
v.01.01 --- September 28, 2009 : new website
        --- March 15, 2010 : new website
        --- January 2, 2012 : new website
v.01.02 --- June 20, 2012 : Python 3
v.02.00 --- November 23, 2018: cleaned Python style
"""

import re
import sys

# Version and date of this script (same as the latest changelog entry above)
VERSION = 'v.02.00 --- November 23, 2018'


#
# Main
######
def main():
    """
    Read the W3C HTML 4.01 entities page from standard input
    and print the corresponding HTML on standard output.

    The output is a <div class="chars"> with one link per entity,
    followed by a <table class="entities"> with one row per entity,
    both ordered by numeric character code.
    The number of entities read is printed on standard error.

    Raises ValueError if the same character code is declared twice.
    """
    entities = _read_entities(sys.stdin)

    print(len(entities), 'entities founded', file=sys.stderr)

    # (code, (name, description, hexa)) pairs sorted by code;
    # codes are unique dict keys, so the tuples are never compared
    rows = sorted(entities.items())

    # Print simple entities list
    print('<div class="chars">')
    print('\n'.join(
        '<a href="entities.html#e-{0}" title="&{0}; | &amp;{0}; | &amp;#{1}; | {2}">&{0};</a>'
        .format(name, code, description)
        for code, (name, description, _) in rows))
    print('</div>')

    # Print HTML table
    print("""<table class="entities">
  <thead><tr class="left"><th colspan="2">Caract&egrave;re</th><th colspan="2">Nom / Code <abbr>HTML</abbr></th><th colspan="2">Description</th></tr></thead>
  <tbody>""")

    for code, (name, description, hexa) in rows:
        # The numeric code cell of lang and rang is flagged as deprecated
        deprecated = (' class="deprecated"' if name in _HTML5_NOTES
                      else '')
        print('<tr id="e-{0}"><td><span>&{0};</span></td><td>&{0};</td><td>&amp;{0};</td><td{4}>&amp;#{1};</td><td>{2}</td><td>{3}</td></tr>'
              .format(name, code, description, hexa, deprecated))

    print("""  </tbody>
</table>""")


#
# Private helpers
#################

# One complete entity declaration, already joined on a single line:
# group 1 is the name, group 2 the decimal code,
# group 3 the description (up to the last comma)
# and group 4 the text following that comma.
_ENTITY_RE = re.compile(
    r'!ENTITY\s+(\w+)\s.+\#(\d+);.+?-- (.+)\s*,\s*(.+)\s*--&gt;\s*$')

# End of a line that closes a declaration or a comment
_END_RE = re.compile(r'--&gt;\s*$')

# Text appended to the description of the entities
# whose numeric code is different in HTML5
_HTML5_NOTES = {'lang': ' (&amp;#10216; U+027E8 for HTML5)',
                'rang': ' (&amp;#10217; U+027E9 for HTML5)'}


def _read_entities(lines):
    """
    Parse the HTML-escaped entity declarations found in lines
    (an iterable of strings) and return them
    as a dict {code: (name, description, hexa)}.

    A declaration may span several lines;
    it is complete when a line ends with '--&gt;'.
    Complete statements that are not entity declarations are ignored.

    Raises ValueError if the same character code is declared twice.
    """
    entities = {}

    pieces = []  # lines of the statement currently being read
    for line in lines:
        # Only remove the end of line: the last line may have none,
        # and blindly dropping its last character would truncate '--&gt;'
        pieces.append(line.rstrip('\r\n'))
        if _END_RE.search(line) is None:  # statement is not complete
            continue

        complete = ''.join(pieces)
        # Start afresh in every case, so that a comment
        # or an unrecognized statement never leaks into the next one
        pieces = []

        match = _ENTITY_RE.search(complete)
        if match is None:  # not an entity declaration
            continue

        code = int(match.group(2))  # entity HTML code
        name = match.group(1).strip()
        if code in entities:
            # Explicit error (an assert would vanish with python -O)
            raise ValueError('character code {} declared twice ({} and {})'
                             .format(code, entities[code][0], name))

        description = re.sub(r'\s+', ' ', match.group(3).strip())
        description += _HTML5_NOTES.get(name, '')
        hexa = match.group(4).strip()

        entities[code] = (name, description, hexa)

    return entities


# Run the conversion only when executed as a script, not when imported
if __name__ == '__main__':
    main()
