treeprint.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

#!/usr/bin/env python3 
 
import sys
import re
try:
    import unicodedata
except:
    pass
 
"""
This program slurps in a .XCompose file on standard input (or several
concatenated together, since it won't follow includes) and outputs the
compose sequences in an S-expression-like syntax, showing the prefix tree
of sequences.  This should bring together some of the groups that use a
prefix-character, like * for the Greek alphabet and # for musical symbols.
And scatter other related things far and wide.  But it might be fun to look
at.
 
Prefix conflicts (in which you have a compose sequence that is the proper
prefix of another) and exact conflicts (in which you have a compose
sequence listed twice with two different translations) cannot be handled
gracefully in this notation, and they are not handled gracefully by this
program.  The tie is not broken in a consistent or predictable fashion,
etc: this is a case of GIGO.  Deal with it.
"""
 
def showdict(data, indent):
    """Print the prefix tree *data* to stdout in an S-expression-like form.

    Args:
        data: dict mapping a key name to either a nested dict (the sequence
            continues) or a string (the result of the sequence).
        indent: column at which each entry of *data* starts.  A negative
            value means "stay on the current line": no leading spaces are
            printed, but the magnitude still records the depth so that a
            level further down which does branch knows its column.

    When "-n" is present in sys.argv, every single-character result is
    followed by its Unicode character name.  Results that have no name, or
    that are longer than one character, are printed without a name.
    """
    show_names = "-n" in sys.argv
    first = True
    for key in sorted(data):
        value = data[key]
        if first:
            first = False
        else:
            print()
        print(" " * max(indent, 0) + "(" + key, end=" ")
        # Sneaky trick: we don't want to go newline-indent over and over
        # for long sequences, i.e. cases where there is only one possible
        # follower.  So we skip the newline in those cases and tell the
        # next-lower call not to indent by passing a negative indent.  We
        # don't just pass 0 because a call *further down* may turn out not
        # to be an only-child case, and it needs the real depth.  E.g. in
        # R-O-{CK|LL} the O is unique after the R (no linefeed), but the
        # {C|L} are not unique after the O.
        if isinstance(value, dict):
            if len(value) > 1:
                print()
                showdict(value, abs(indent) + 4)
            else:
                # Negate the *deepened* magnitude.  Negating abs(indent + 4)
                # would collapse an incoming -4 to 0 and lose the depth.
                showdict(value, -(abs(indent) + 4))
        else:
            print("    " + value, end=" ")
            if show_names:
                # Best effort only: unicodedata.name() raises ValueError for
                # unnamed characters and TypeError for multi-character
                # strings; in both cases simply print no name.
                try:
                    import unicodedata
                    print(unicodedata.name(value), end=" ")
                except (ImportError, TypeError, ValueError):
                    pass
        print(")", end=" ")
 
# Prefix tree of all compose sequences read from standard input.  Each key
# is a keysym name; each value is either a nested dict (the sequence
# continues with further keys) or the result string of a complete sequence.
listing = {}

# One "<keysym>" token, optionally preceded by whitespace.
keysym_re = re.compile(r"\s*<(\w+)>")
# The first double-quoted string on the line.  Backslash escapes are
# consumed as a pair so that a result such as "\"" is captured whole rather
# than being cut off at the escaped quote.
result_re = re.compile(r'[^"]*"((?:\\.|[^"\\])+)"')

for line in sys.stdin:
    # Collect the run of <keysym> tokens at the start of the line.
    name = []
    startpos = 0
    while True:
        m = keysym_re.match(line, startpos)
        if not m:
            break
        name.append(m.group(1))
        startpos = m.end()
    if not name:
        # Not a compose-sequence line (blank, comment, include, ...).
        continue

    m = result_re.match(line)
    if m:
        val = m.group(1)
    else:
        # shouldn't happen, but just in case
        val = '???'
        print("couldn't make sense of line: " + line)

    # Walk (and create as needed) the path for every key but the last.
    cur = listing
    for elt in name[:-1]:
        if not isinstance(cur, dict):
            # Prefix conflict: a shorter sequence already ends here.
            break
        cur = cur.setdefault(elt, {})

    # Presumably by now we're at the dict that should hold the final key.
    # If we are pointing at a result string instead, this sequence is in
    # prefix conflict with an earlier one and is ignored.
    if isinstance(cur, dict):
        cur[name[-1]] = val

print("hit end")

showdict(listing, 0)