polyglotstats

word frequency lists for different languages

wikipedia_word_count.py
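
The script takes a single argument: the path to a MediaWiki pages-articles XML dump, either plain or bz2-compressed. It writes per-page lemma counts under wiki_count/ and content hashes under wiki_cache/, so re-runs skip unchanged pages. A typical invocation (the dump filename is illustrative):

    python wikipedia_word_count.py hrwiki-latest-pages-articles.xml.bz2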


"""Count lemma frequencies for every page of a MediaWiki XML dump.

Reads a (possibly bz2-compressed) pages-articles dump, lemmatizes each
page with stanza, and writes per-page lemma counts under wiki_count/.
"""
import bz2
import contextlib
import dataclasses
import hashlib
import sys
from collections import Counter
from pathlib import Path

import mwxml
import mwparserfromhell
import stanza
import torch


# ISO 639-1 code of the Wikipedia language edition to process (here: Croatian).
LANG = 'hr'


# Limit the number of threads so the server is not pinned at 100% load.
torch.set_num_threads(4)
pipeline = stanza.Pipeline(LANG, processors='tokenize,pos,lemma')


@dataclasses.dataclass
class TextPage:
    """Plain-text content of a single wiki page."""

    page_id: int
    title: str
    text: str


def count_text_words(text: str) -> Counter:
    """Count lemma occurrences in text, ignoring punctuation and symbols."""
    c = Counter()

    doc = pipeline(text)

    for sentence in doc.sentences:
        for token in sentence.tokens:
            # to_dict() returns a list of dicts; only the first entry is
            # inspected, and entries without a lemma (e.g. multi-word token
            # spans) are skipped by the check below.
            token_data = token.to_dict()[0]

            if (
                'lemma' in token_data
                and 'upos' in token_data
                and token_data['upos'].lower() not in ('punct', 'sym')
            ):
                c[token_data['lemma']] += 1

    return c


def write_counts_summary(article_id: int, counts: Counter):
    target_path = Path('wiki_count') / f'{LANG}wiki' / str(article_id)
    target_path.parent.mkdir(parents=True, exist_ok=True)

    # Counter.total() requires Python 3.10+. Guard against empty pages,
    # which would otherwise cause a division by zero below.
    total = counts.total()
    if total == 0:
        return

    with open(target_path, 'wt', encoding='utf-8') as f_out:
        # One tab-separated line per lemma, most frequent first.
        for item, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
            item = item.replace('\t', ' ')
            percentage = count / total
            f_out.write(f'{item}\t{count}\t{percentage}\n')


@contextlib.contextmanager
def open_wiki_dump(path):
    # Transparently handle both bz2-compressed and plain XML dumps.
    if path.endswith('bz2'):
        with bz2.open(path) as f_bzip:
            yield f_bzip
    else:
        with open(path, encoding='utf-8') as f_wiki:
            yield f_wiki


def main():
    wiki_file = sys.argv[1]

    with open_wiki_dump(wiki_file) as f:
        dump = mwxml.Dump.from_file(f)

        for page in dump:
            cnt = 0
            for revision in page:
                cnt += 1
                # revision.text may be None (e.g. deleted revisions).
                parsed = mwparserfromhell.parse(revision.text or '')

            # A pages-articles dump is expected to carry exactly one
            # revision per page.
            if cnt != 1:
                raise RuntimeError(f'Expected exactly one revision, found {cnt}')

            text = TextPage(
                page_id=page.id,
                title=page.title,
                text=parsed.strip_code(),
            )

            print(f'{text.page_id} - {text.title}')

            cache_file = Path('wiki_cache') / LANG / str(text.page_id)

            # The cache file holds the hash of the page text from the last
            # run, so unchanged pages can be skipped.
            cache_hash = None
            if cache_file.is_file():
                with open(cache_file, 'rt', encoding='utf-8') as f_cache:
                    cache_hash = f_cache.read().strip()

            text_hash = hashlib.sha256(text.text.encode('utf-8')).hexdigest()
            if cache_hash == text_hash:
                continue

            counts = count_text_words(text.text)
            write_counts_summary(text.page_id, counts)

            # Record the hash only after the counts were written.
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            with open(cache_file, 'wt', encoding='utf-8') as f_cache:
                f_cache.write(f'{text_hash}\n')


if __name__ == '__main__':
    main()
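
Each processed page yields a file at wiki_count/hrwiki/<page_id> containing one tab-separated line per lemma, sorted by descending count: the lemma, its absolute count, and its share of all counted tokens. An illustrative line (values made up):

    biti	412	0.0317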