]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics/Occurrences.hs
[FEAT] grouping ngrams better written (simplified) with semigroup. TODO: update the...
[gargantext.git] / src / Gargantext / Text / Metrics / Occurrences.hs
1 {-|
2 Module : Gargantext.Text.Metrics.Occurrences
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Token and occurrence
11
12 An occurrence is not necessarily a token. Considering the sentence:
13 "A rose is a rose is a rose". We may equally correctly state that there
14 are eight or three words in the sentence. There are, in fact, three word
15 types in the sentence: "rose", "is" and "a". There are eight word tokens
16 in a token copy of the line. The line itself is a type. There are not
17 eight word types in the line. It contains (as stated) only the three
18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
19 call what there are eight of? They are occurrences of words. There are
20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
22
23 -}
24
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27
28 module Gargantext.Text.Metrics.Occurrences
29 where
30
31 import Gargantext.Prelude
32
33 import Data.Map.Strict (Map
34 , empty
35 , insertWith, unionWith
36 )
37
38 import Control.Monad ((>>),(>>=))
39 import Data.String (String())
40 import Data.Attoparsec.Text
41 import Data.Text (Text)
42
43 import Data.Either.Extra(Either(..))
44 import qualified Data.Text as T
45 import Control.Applicative hiding (empty)
46 -----------------------------------------------------------
47
-- | Number of times an item has been seen.
type Occ = Int

-- | Count how many times each distinct element appears in the list.
-- Every element adds 1 to the entry stored under its own key.
occ :: Ord a => [a] -> Map a Occ
occ = foldl' countOne empty
  where
    -- Register one more sighting of @item@ in the running tally.
    countOne tally item = insertWith (+) item 1 tally
53
-- TODO add groups and filter stops

-- | Merge several occurrence maps into a single one, adding together the
-- counts recorded under the same key. An empty list yields an empty map.
sumOcc :: Ord a => [Map a Occ] -> Map a Occ
sumOcc = foldl' mergeCounts empty
  where
    -- Fold the next map into the running total, summing shared keys.
    mergeCounts total next = unionWith (+) total next
57
58
-- | Skip characters up to the first occurrence of @txt@, consume that
-- occurrence and yield 'True'. The parser fails when @txt@ cannot be found
-- in the remaining input.
occurrenceParser :: Text -> Parser Bool
occurrenceParser txt = True <$ manyTill anyChar (string txt)
61
-- | Count the non-overlapping occurrences of @txt@ in the input.
--
-- The searched term is lower-cased before matching; the input itself is
-- matched as it is. An empty term is reported as zero occurrences up front:
-- 'string' on an empty text succeeds without consuming anything, which is
-- the situation in which 'many' does not terminate.
occurrencesParser :: Text -> Parser Int
occurrencesParser txt =
  if T.null txt
    then pure 0
    else length <$> many (occurrenceParser needle)
  where
    needle = T.toLower txt
68
-- | Count how many times the term @x@ occurs in a text, ignoring case.
--
-- 'occurrencesParser' lower-cases the searched term, so the text is
-- lower-cased here as well; otherwise every capitalised occurrence in the
-- text (e.g. term @"Rose"@ in @"A Rose"@) would silently be missed.
--
-- The result is 'Right' with the number of occurrences, or 'Left' with the
-- parser's error message should parsing fail.
parseOccurrences :: Text -> Text -> Either String Int
parseOccurrences x txt = parseOnly (occurrencesParser x) (T.toLower txt)