File size: 1,143 Bytes
7900c16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import argparse


def _load_vocab(path):
    """Return the set of first whitespace-separated tokens found in *path*.

    The file is read as UTF-8, one entry per line. Only the first token of
    each line is kept; anything after it on the same line is ignored. Lines
    that are empty or contain only whitespace contribute nothing.

    Args:
        path: Path of the vocabulary file to read.

    Returns:
        A set with one string per distinct first token.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    vocab = set()
    with open(path, mode='r', encoding='utf-8') as f:
        for line in f:
            # split() with no arguments already discards surrounding
            # whitespace, so a blank line yields an empty list.
            tokens = line.split()
            if tokens:
                vocab.add(tokens[0])
    return vocab


def main(argv=None):
    """Compare two vocabulary files and print size and overlap statistics.

    Prints the size of each vocabulary, the number of entries found only in
    the first, only in the second, and in both.

    Args:
        argv: Command-line arguments to parse. When None, argparse falls
            back to ``sys.argv[1:]``.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            the arguments are otherwise invalid.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Both paths are mandatory: without them open() would fail later with an
    # unhelpful TypeError instead of a usage message.
    parser.add_argument("--vocab_1", type=str, required=True,
                        help="path to the first vocabulary file")
    parser.add_argument("--vocab_2", type=str, required=True,
                        help="path to the second vocabulary file")
    args = parser.parse_args(argv)

    vocab_set_1 = _load_vocab(args.vocab_1)
    vocab_set_2 = _load_vocab(args.vocab_2)

    print("vocab_1: " + args.vocab_1 + ", size: " + str(len(vocab_set_1)))
    print("vocab_2: " + args.vocab_2 + ", size: " + str(len(vocab_set_2)))

    print("vocab_1 - vocab_2 = " + str(len(vocab_set_1 - vocab_set_2)))
    print("vocab_2 - vocab_1 = " + str(len(vocab_set_2 - vocab_set_1)))
    print("vocab_1 & vocab_2 = " + str(len(vocab_set_1 & vocab_set_2)))


if __name__ == '__main__':
    main()