polyglotstats

word frequency lists for different languages
Log | Files | Refs | README

sum_counts.py (949B)


      1 import operator
      2 import timeit
      3 from collections import Counter
      4 from pathlib import Path
      5 
      6 
# Language prefix used to build the input directory `wiki_count/{LANG}wiki/`
# and the output path `data/{LANG}wiki/all` (presumably a Wikipedia
# language code; 'hr' would be Croatian -- confirm).
LANG = 'hr'
      8 
      9 
     10 def read_text_counts(filepath: Path) -> Counter:
     11     c = Counter()
     12 
     13     with open(filepath, 'rt') as f:
     14         for line in f:
     15             word, count, _percentage = line.strip().split('\t')
     16             c[word] += int(count)
     17 
     18     return c
     19 
     20 
     21 def main():
     22     files = list(Path(f'wiki_count/{LANG}wiki/').glob('*'))
     23     total = len(files)
     24 
     25     c = Counter()
     26 
     27     last_time = timeit.default_timer()
     28     for i, file in enumerate(files):
     29         if i % 1000 == 0:
     30             now = timeit.default_timer()
     31             print(f'{i} / {total} ({now - last_time})')
     32             last_time = now
     33 
     34         c.update(read_text_counts(file))
     35 
     36     with open(f'data/{LANG}wiki/all', 'wt') as f:
     37         for word, count in sorted(c.items(), key=operator.itemgetter(1), reverse=True):
     38             f.write(f'{word}\t{count}\n')
     39 
     40 
# Run the aggregation only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()