]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/Count.hs
[MOCK] More credible count.
[gargantext.git] / src / Gargantext / Ngrams / Count.hs
1 {-# LANGUAGE OverloadedStrings #-}
2
3 module Gargantext.Ngrams.Count where
4
5 import Gargantext.Prelude
6
7
8 import Data.Foldable as F
9
10 import Data.Map.Strict (insertWith)
11 import Data.Map (Map)
12 import qualified Data.Map as M
13
14 --import qualified Data.Text.Lazy.IO as DTLIO
15 import qualified Data.Text.Lazy as DTL
16
-- | /O(n)/ Split a lazy 'DTL.Text' into a list of one-character texts,
-- by cutting it into chunks of length 1.
--
-- NOTE(review): the original comment ranked the variants in this module
-- "from slower to faster", with this one first; that ordering has not
-- been re-measured.
letters :: DTL.Text -> [DTL.Text]
letters = DTL.chunksOf 1
21
-- | Split a lazy 'DTL.Text' into a list of one-character texts.
--
-- Each character of the input becomes its own singleton 'DTL.Text', in
-- order; the empty text yields the empty list.
--
-- The previous implementation interspersed a @\'#\'@ sentinel and split on
-- it, which produced spurious empty pieces for inputs that themselves
-- contain @\'#\'@ and returned @[\"\"]@ for the empty text. Building the
-- result straight from the characters needs no sentinel, so every input is
-- handled uniformly.
letters' :: DTL.Text -> [DTL.Text]
letters' text = [DTL.singleton ch | ch <- DTL.unpack text]
24
-- | Split a lazy 'DTL.Text' into a list of one-character texts using a
-- right fold over its characters.
letters'' :: DTL.Text -> [DTL.Text]
letters'' text = DTL.foldr prepend [] text
  where
    -- Wrap the current character as a singleton text and put it in front
    -- of the pieces already produced for the rest of the input.
    prepend c rest = DTL.singleton c : rest
27
28
29 -- words
30 -- lines
31 -- words between punctuation
32 -- number of punctuation
33
-- | Count how many times each distinct element appears in a list.
--
-- The result has one entry per distinct element of @xs@, mapped to its
-- number of occurrences; the empty list gives the empty map.
occurrences :: Ord a => [a] -> Map a Int
occurrences xs = foldl' bump M.empty xs
  where
    -- Add one to the tally of @item@; 'insertWith' creates the entry with
    -- value 1 when the item has not been seen yet.
    bump tally item = insertWith (+) item 1 tally
36
37 -- for optimization :
38 --occurrences' :: Ord a => [a] -> Map a Integer
39 --occurrences' xs = DTL.foldl (\x y -> M.insertWith' (+) y 1 x) M.empty xs
40
41 --countMain :: IO ()
42 --countMain = do
43 -- (fichier:_) <- getArgs
44 -- c <- DTLIO.readFile fichier
45 -- --print $ occurrences $ DTL.chunksOf 1 c
46 -- pure $ occurrences $ letters'' c
47 -- --print $ occurrences $ DTL.words $ DTL.toLower c
48 --