" not in l.lower()): r = re.compile(r"<title>\s*(.*)\s*

#!/usr/bin/env python
# -*- coding: latin-1 -*-

######################################################################
# makeArXivIndex
#
# Walks the subdirectories of the current directory and prints, on
# standard output, an HTML page listing the arXiv articles found there.
#
# (c) Olivier Pirson ---- DragonSoft
# http://www.opimedia.be/DS/
#
# Started 1 November 2007
# v.01.00 --- 22 November 2007
# v.01.01 --- 19 May 2008
# v.01.02 --- 9 July 2008
# v.01.03 --- 2 August 2008 (ignores makeArXivIndex.py and .sh;
#                            "%" moved)
# v.01.04 --- 12 November 2008 (displays "progress")
# v.01.05 --- 13 March 2009 (target="_blank")
# v.01.06 --- 27 September 2009: new web site
#         --- 15 March 2010: new web site
#         --- 2 January 2012: new web site
#
# NOTE(review): the copy of this script that was reviewed had the HTML
# markup stripped out of its string literals and regular expressions
# (format strings had fewer "%s" than arguments, "<br>" had become a
# raw newline, "&rarr;" had become a literal arrow).  Every literal
# tagged "reconstructed" below is a minimal best guess; compare with a
# pristine copy before relying on the exact markup (for instance the
# target="_blank" attribute mentioned for v.01.05 is not restored).
######################################################################

import glob
import os
import re
import sys

VERSION = "v.01.06 --- 2012 January 2012"


#############
# Constants #
#############
# HTML right arrow (entity form, so the file stays latin-1/ASCII clean)
RARR = ' &rarr;'

# Reconstructed markup -- NOTE(review): verify against a pristine copy.
_LINK = '<a href="%s">%s</a>'
_LINE_BREAK = '<br />'
_RULE = '<hr />'
_HTML_HEADER = """<html>
<head>
<title>arXiv index</title>
<meta name="generator" content="makeArXivIndex %s">
</head>
<body>"""
_HTML_FOOTER = """</body>
</html>"""

# File-type detection by extension (case-insensitive).
_HTML_EXT_RE = re.compile(r"\.html?$", re.I)
_PDF_EXT_RE = re.compile(r"\.pdf$", re.I)
_PS_EXT_RE = re.compile(r"\.ps$", re.I)
_DVI_EXT_RE = re.compile(r"\.dvi$", re.I)
_TEX_EXT_RE = re.compile(r"\.tex$", re.I)

# Patterns used by extractArXivInfo().  These survived intact.
_IDENTIFIER_RE = re.compile(
    r'dc:identifier="http://.*?arxiv.org/abs/(\D*\d*\.?\d{4})"', re.I)
_HREF_RE = re.compile(
    r'dc:identifier="(http://.*?arxiv.org/abs/\D*\d*\.?\d{4})"', re.I)
_CATEGORY_FALLBACK_RE = re.compile(
    r'(?:catego|subject).*?>\s*(.+?)\s*<', re.I)
_DC_TITLE_TRACKBACK_RE = re.compile(
    r'dc:title="\s*(.+)\s*"\s*trackback:', re.I)
_DC_TITLE_RE = re.compile(r'dc:title="\s*(.+)\s*"?', re.I)
_DATE_MARKER_RE = re.compile(r"date.+(\d{1,2}\s+\w+\s+\d{4})", re.I)
_DATE_RE = re.compile(r"(\d{1,2}\s+\w+\s+\d{4})")
_VERSION_RE = re.compile(r"(v\d+)")
# The closing tag of this one was stripped; the opening tag was visible.
_HTML_TITLE_RE = re.compile(r"<title>\s*(.*)\s*</title>", re.I)

# Patterns whose surrounding tags were lost entirely.
# NOTE(review): pure reconstructions modelled on arXiv abstract pages;
# confirm against a pristine copy and against the saved pages in use.
_CATEGORY_RE = re.compile(
    r'<span class="primary-subject">\s*(.+?)\s*</span>', re.I)
_AUTHOR_RE = re.compile(r'<a href="[^"]*/find/[^"]*"[^>]*>(.+?)</a>', re.I)
# "%s" receives the (regex-escaped) arXiv identifier; group 1 is used as a
# last-resort category.
_PDF_LINK_PATTERN = r'<a href="[^"]*/pdf/([^"/]+)/%s[^"]*"[^>]*>PDF</a>'

# Keeps only the last alphabetic word of an author name.
_AUTHOR_LAST_WORD_RE = re.compile("[^a-zA-Z]*([a-zA-Z]+)[^a-zA-Z]*$")

# TeX accent commands, braced forms first, in the original order
# (circumflex, grave, acute).  The caret is escaped: unescaped it is a
# start-of-string anchor and the circumflex rules could never match.
_TEX_ACCENTS = (
    (re.compile(r"\\\^\{(.)\}"), r"&\1circ;"),
    (re.compile(r"\\`\{(.)\}"), r"&\1grave;"),
    (re.compile(r"\\'\{(.)\}"), r"&\1acute;"),
    (re.compile(r"\\\^(.)"), r"&\1circ;"),
    (re.compile(r"\\`(.)"), r"&\1grave;"),
    (re.compile(r"\\'(.)"), r"&\1acute;"),
)
# Greek letter commands.  The look-ahead stops "\pi" from firing inside a
# longer command such as "\pitchfork".
_TEX_GREEK_RE = re.compile(
    r"\\(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda"
    r"|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)"
    r"(?![a-zA-Z])", re.I)


#############
# Variables #
#############
nberrors = 0     # number of errors/warnings reported so far
verbose = False  # if True, print extra information on the error output


#############
# Functions #
#############
def _write_line(stream, text=""):
    """Write text followed by a newline to stream (Python 2 and 3)."""
    stream.write("%s\n" % text)


def _to_text(raw):
    """Return raw as a native string.

    Files are opened in binary mode so that non-text files never raise a
    decoding error.  On Python 2 the data is already a str; on Python 3
    the bytes are decoded as latin-1, which accepts every byte value."""
    if isinstance(raw, str):
        return raw
    return raw.decode("latin-1")


def _strip_html_ext(name):
    """Return name without a trailing .htm/.html extension."""
    return _HTML_EXT_RE.sub("", name)


def buildArxivDir(dname):
    """Build the index entry for one article directory.

    The first HTML file from which an arXiv identifier can be extracted
    becomes the "main" file; all other entries are listed after it,
    grouped by type.  A file named "readed.txt" marks the article as
    read ("%" is inserted in the line).

    pre: dname: path of a readable directory
    result: (key, HTML line, found) where key is the arXiv identifier
            (or dname if none was found), and found tells whether an
            arXiv HTML file was found in the directory"""
    htmlmain = None   # main arXiv HTML file
    infosmain = None  # information extracted from htmlmain
    html = []         # other HTML files
    pdf = []
    ps = []
    dvi = []
    tex = []
    others = []

    # Sort the directory entries by type.
    for f in pathlist(dname):
        if _HTML_EXT_RE.search(f):
            checktypefile(f, "html")
            if htmlmain is None:
                infos = extractArXivInfo(f)
                if infos["identifier"] is not None:
                    # Probably an arXiv abstract page.
                    htmlmain = f
                    infosmain = infos
                    continue
            html.append(f)
        elif _PDF_EXT_RE.search(f):
            checktypefile(f, "%PDF")
            pdf.append(f)
        elif _PS_EXT_RE.search(f):
            checktypefile(f, "%!PS")
            ps.append(f)
        elif _DVI_EXT_RE.search(f):
            checktypefile(f, "TeX output")
            dvi.append(f)
        elif _TEX_EXT_RE.search(f):
            tex.append(f)
        else:
            others.append(f)

    # Build the HTML line.
    htmlline = []
    found = infosmain is not None
    if not found:
        print_to_err("'%s/' don't contain HTML arXiv file" % dname)
        htmlline.append('%s/' % dname)
    else:
        identifier = infosmain["identifier"]

        # arXiv number, linked to the abstract page when its URL is known.
        if infosmain["href"] is not None:
            htmlline.append(_LINK % (infosmain["href"], identifier))
        else:
            htmlline.append('%s' % identifier)

        # Link to the local main HTML file, labelled with the best title.
        if infosmain["title"] is not None:
            label = tex_to_html(infosmain["title"])
        elif infosmain["htmltitle"] is not None:
            label = tex_to_html(infosmain["htmltitle"])
        else:
            label = htmlmain
        htmlline.append(_LINK % (htmlmain, label))

        # Main PDF: the one named after the identifier.  It is removed
        # from pdf so that it is not listed a second time below.
        for i, f in enumerate(pdf):
            if os.path.splitext(os.path.basename(f))[0] == identifier:
                htmlline.append(RARR)
                htmlline.append(_LINK % (f, 'PDF'))
                del pdf[i]
                break

        # Author(s), reduced to their last alphabetic word.
        names = [_AUTHOR_LAST_WORD_RE.sub(r' \1', a)
                 for a in infosmain["authors"]]
        if names:
            htmlline.append(' (%s)' % ", ".join(names))

        # Category, date, version.
        for key in ("category", "dateline", "version"):
            if infosmain[key] is not None:
                htmlline.append(' %s' % infosmain[key])

    # htmlline is never empty here (both branches above append to it),
    # so the original guard was always true.
    htmlline[-1] += " :"

    # Links to the other HTML files.
    links = [_LINK % (f, _strip_html_ext(os.path.basename(f))) for f in html]
    if links:
        htmlline.append(", ".join(links))

    # Links to the remaining PDF, PS, DVI and (La)TeX files.
    for label, files in (("PDF", pdf), ("PS", ps), ("DVI", dvi),
                         ("TeX", tex)):
        if files:
            htmlline.append(RARR)
            htmlline.append(", ".join([_LINK % (f, label) for f in files]))

    # Everything else.
    # NOTE(review): files whose name starts with "see" had a separate
    # branch in the original; whatever markup distinguished them was
    # lost, so they are rendered like any other file here.
    readed = False
    links = []
    for o in others:
        base = os.path.basename(o)
        if base == "readed.txt":
            readed = True
        if os.path.isdir(o):
            links.append(_LINK % (o, base + "/"))
        else:
            links.append(_LINK % (o, base))
    if links:
        htmlline.append(RARR)
        htmlline.append(", ".join(links))

    # Already read?
    if readed:
        htmlline.insert(1, '%')

    htmlline.append(_LINE_BREAK)

    key = infosmain["identifier"] if found else dname
    return (key, " ".join(htmlline), found)


def checktypefile(fname, s):
    """Report on the error output when the first line of fname does not
    contain s (case-insensitive).  An empty file is reported as well.

    pre: fname: name of a readable file
         s: string"""
    with open(fname, "rb") as f:
        first = _to_text(f.readline())
    if s.lower() not in first.lower():
        print_to_err("'%s' not %s" % (fname, s))


def extractArXivInfo(fname):
    """Return a dictionary with the information extracted from fname.

    Each line is matched together with the previous one, so that items
    spread over two lines are still found.

    pre: fname: name of a readable file
    result: dictionary["authors"]: list of strings
                      ["category"]: string or None
                      ["dateline"]: string or None
                      ["htmltitle"]: string or None
                      ["href"]: string or None
                      ["identifier"]: string or None
                      ["title"]: string or None
                      ["version"]: string or None"""
    d = {"authors": [],
         "category": None,
         "dateline": None,
         "htmltitle": None,
         "href": None,
         "identifier": None,
         "title": None,
         "version": None}

    cat = None   # category found through the generic pattern
    cat2 = None  # category found through the PDF link
    prev = ""    # previous line

    with open(fname, "rb") as f:
        for raw in f:
            # Strip only the line terminator (the original dropped the
            # last character unconditionally).
            l = _to_text(raw).rstrip("\r\n")
            t = prev + l  # previous line followed by the current one

            if d["identifier"] is None:
                r = _IDENTIFIER_RE.search(t)
                if r is not None:
                    d["identifier"] = r.group(1)

            if d["category"] is None:
                r = _CATEGORY_RE.search(t)
                if r is not None:
                    d["category"] = r.group(1)

            if (cat is None) and (d["category"] is None):
                r = _CATEGORY_FALLBACK_RE.search(t)
                if r is not None:
                    cat = r.group(1)

            if (cat2 is None) and (d["identifier"] is not None):
                # The identifier is escaped: its "." must not act as a
                # wildcard.
                pattern = _PDF_LINK_PATTERN % re.escape(d["identifier"])
                r = re.compile(pattern, re.I).search(t)
                if r is not None:
                    cat2 = r.group(1)

            # When the opening marker is on the current line the search
            # is postponed to the next iteration, where t holds this
            # line plus the following one.
            if (d["htmltitle"] is None) and ("<title>" not in l.lower()):
                r = _HTML_TITLE_RE.search(t)
                if r is not None:
                    d["htmltitle"] = r.group(1)

            if d["href"] is None:
                r = _HREF_RE.search(t)
                if r is not None:
                    d["href"] = r.group(1)

            # Same postponement as for the HTML title.
            if (d["title"] is None) and ('dc:title="' not in l.lower()):
                r = _DC_TITLE_TRACKBACK_RE.search(t)
                if r is None:
                    r = _DC_TITLE_RE.search(t)
                if r is not None:
                    d["title"] = r.group(1)

            if _DATE_MARKER_RE.search(t) is not None:
                dates = _DATE_RE.findall(t)
                if dates:
                    d["dateline"] = dates[-1]  # last date seen
                versions = _VERSION_RE.findall(t)
                if versions:
                    d["version"] = versions[-1]  # last version seen

            r = _AUTHOR_RE.search(l)
            if r is not None:
                d["authors"].append(r.group(1))

            prev = l

    if d["category"] is None:
        d["category"] = cat if cat is not None else cat2

    return d


def help_msg():
    """Print the help message on the error output, then terminate the
    program with exit code 1."""
    # NOTE(review): the line layout of this message was lost; the text
    # itself is unchanged.
    _write_line(sys.stderr, """makeArXivIndex
Print on standard output a HTML index of subdirectories with arXiv articles.

(c) Olivier Pirson --- DragonSoft --- http://www.opimedia.be/DS/
%s""" % VERSION)
    sys.exit(1)


def pathlist(path):
    """Return the sorted list of the entries of path, with "/" as
    separator.  Every directory '*_fichiers' that accompanies a file
    '*.htm[l]' (resources of a saved web page) is left out.

    pre: path: readable path
    result: list of files and/or directories"""
    entries = [f.replace("\\", "/")
               for f in glob.glob("%s/*" % os.path.normpath(path))]
    entries.sort(key=str.lower)

    def has_page(stem):
        return os.path.isfile(stem + ".htm") or os.path.isfile(stem + ".html")

    result = []
    for f in entries:
        if os.path.isdir(f) and f.endswith("_fichiers"):
            stem = f[:-len("_fichiers")]
            # The original only tried the lower-cased path, which misses
            # the page on case-sensitive file systems; try the real name
            # first and keep the lower-cased lookup as a fallback.
            if has_page(stem) or has_page(stem.lower()):
                continue
        result.append(f)
    return result


def print_to_err(s):
    """Send s to the error output and increment nberrors.

    pre: s: anything"""
    global nberrors

    _write_line(sys.stderr, "! %s" % s)
    sys.stderr.flush()
    nberrors += 1


def tex_to_html(s):
    """Return s with a few TeX constructs converted to HTML entities:
    accent commands (\\' \\` \\^, with or without braces) and lower-case
    Greek letter commands.

    result: string"""
    for regex, repl in _TEX_ACCENTS:
        s = regex.sub(repl, s)
    return _TEX_GREEK_RE.sub(r"&\1;", s)


def main():
    """Print the HTML index of the current directory on standard output
    and a summary on the error output."""
    if len(sys.argv) >= 2:
        help_msg()

    if os.sep != os.path.normpath("\\"):
        print_to_err("path separator problem")

    # HTML start
    _write_line(sys.stdout, _HTML_HEADER % VERSION)
    sys.stdout.flush()

    nbdirs = 0          # number of subdirectories
    nbdirsnotarxiv = 0  # number of subdirectories without arXiv HTML file
    nb = 0              # number of arXiv articles
    nbtwins = 0         # number of duplicated identifiers

    sfiles = []  # one string per non-HTML file
    shtml = []   # one string per HTML file
    sdirs = {}   # one string per directory, keyed by identifier

    old_c = None  # first character of the previous directory name

    for f in pathlist("./"):
        f = os.path.normpath(f).replace("\\", "/")
        if verbose:
            _write_line(sys.stderr, f)

        if os.path.isdir(f):
            # Progress: one name each time the initial letter changes.
            if (not verbose) and (f[0] != old_c):
                _write_line(sys.stderr, f)
                old_c = f[0]
            nbdirs += 1
            key, line, found = buildArxivDir(f)
            if key in sdirs:
                print_to_err("'%s' is duplicated '%s' arXiv file" % (f, key))
                nbtwins += 1
            else:
                sdirs[key] = line
            # NOTE(review): the original indentation was lost; counting is
            # assumed to apply to every directory, duplicated or not.
            if found:
                nb += 1
            else:
                nbdirsnotarxiv += 1
            continue

        # Plain file: skip this script and its shell launcher.
        if f.endswith(("makeArXivIndex.py", "makeArXivIndex.sh")):
            continue

        fn = _strip_html_ext(f)
        if fn == f:
            # Not an HTML file.
            sfiles.append((_LINK + _LINE_BREAK) % (f, fn))
        else:
            htmltitle = extractArXivInfo(f)["htmltitle"]
            if htmltitle:
                # The original test was inverted: the title was only
                # used when it was empty.
                shtml.append((_LINK + ' %s' + _LINE_BREAK)
                             % (f, htmltitle, f))
            else:
                shtml.append((_LINK + _LINE_BREAK) % (f, f))

    shtml.sort(key=str.lower)
    for s in shtml:
        _write_line(sys.stdout, s)
    if shtml:
        _write_line(sys.stdout, _RULE + "\n")

    sfiles.sort(key=str.lower)
    for s in sfiles:
        _write_line(sys.stdout, s)
    if sfiles:
        _write_line(sys.stdout, _RULE + "\n")

    for k in sorted(sdirs, key=str.lower):
        _write_line(sys.stdout, sdirs[k])

    # HTML end
    _write_line(sys.stdout, _HTML_FOOTER)

    # Summary
    _write_line(sys.stderr)
    _write_line(sys.stderr,
                "%u arXiv articles in %u subdirectories" % (nb, nbdirs))
    if nbdirsnotarxiv > 0:
        _write_line(sys.stderr,
                    " ! %u subdirectories without HTML arXiv file"
                    % nbdirsnotarxiv)
    if nbtwins > 0:
        _write_line(sys.stderr, " ! %u duplicated files" % nbtwins)
    if nberrors > 0:
        _write_line(sys.stderr, " ! %u errors/warnings" % nberrors)


########
# Main #
########
if __name__ == "__main__":
    main()