]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Ngrams/Count.hs
[FEAT] Ngrams extractor in English with tests : ok. Need to factor pattern matching...
[gargantext.git] / src / Data / Gargantext / Ngrams / Count.hs
1 {-# LANGUAGE OverloadedStrings #-}
2
3 module Data.Gargantext.Ngrams.Count where
4
5 import System.Environment (getArgs)
6
7 import Data.Foldable as F
8
9 import Data.Map (Map)
10 import qualified Data.Map as M
11
12 import qualified Data.Text.Lazy.IO as DTLIO
13 import qualified Data.Text.Lazy as DTL
14
-- | /O(n)/ Split a lazy 'DTL.Text' into a list of one-character texts.
--
-- This is the first of three variants of the same idea ('letters',
-- @letters'@ and @letters''@), which the original author lists from
-- slower to faster.
letters :: DTL.Text -> [DTL.Text]
letters = DTL.chunksOf 1
19
-- | /O(n)/ Split a lazy 'DTL.Text' into a list of one-character texts.
--
-- The earlier implementation interspersed @\'#\'@ between the characters
-- and then split on @"#"@. That gave wrong results whenever the input
-- already contained @\'#\'@ (for example @"a#b"@ became
-- @["a", "", "", "b"]@) and it returned @[""]@ instead of @[]@ for empty
-- input, disagreeing with 'letters' and @letters''@. Each character is now
-- converted directly, so no separator character is needed.
letters' :: DTL.Text -> [DTL.Text]
letters' = map DTL.singleton . DTL.unpack
22
-- | Split a lazy 'DTL.Text' into a list of one-character texts using a
-- right fold over its characters.
letters'' :: DTL.Text -> [DTL.Text]
letters'' text = DTL.foldr prepend [] text
  where
    -- Wrap the current character in its own Text and put it in front of
    -- the pieces produced for the rest of the input.
    prepend c rest = DTL.singleton c : rest
25
26
27 -- words
28 -- lines
29 -- words between punctuation
30 -- number of punctuation
31
-- | Count how many times each distinct element appears in a list.
--
-- The result maps every element of the input to its number of
-- occurrences; an empty list gives an empty map.
--
-- The earlier version relied on @M.insertWith'@, which has been removed
-- from the containers package. 'M.alter' is used instead, and each new
-- count is forced with '$!' so that no chain of unevaluated @(+ 1)@
-- thunks builds up in the map while folding.
occurrences :: Ord a => [a] -> Map a Int
occurrences = foldl' bump M.empty
  where
    -- Record one more sighting of @key@ in the counts gathered so far.
    bump counts key = M.alter increment key counts

    -- First sighting starts at 1; later sightings add 1 strictly.
    increment Nothing  = Just 1
    increment (Just n) = Just $! n + 1
34
35 -- for optimization :
36 --occurrences' :: Ord a => [a] -> Map a Integer
37 --occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
38
-- | Read the file named by the first command-line argument and print the
-- number of occurrences of each character in it.
--
-- Any further arguments are ignored. When no argument is given, an
-- 'IOError' with an explanatory message is raised instead of the opaque
-- pattern-match failure produced by the previous @(fichier:_) <- getArgs@.
countMain :: IO ()
countMain = do
  args <- getArgs
  case args of
    [] ->
      ioError (userError "countMain: expected a file path as first argument")
    (fichier:_) -> do
      c <- DTLIO.readFile fichier
      -- Alternative character split kept from the original:
      --print $ occurrences $ DTL.chunksOf 1 c
      print $ occurrences $ letters'' c
      -- Alternative kept from the original: count lower-cased words
      -- instead of characters.
      --print $ occurrences $ DTL.words $ DTL.toLower c
46