src/Gargantext/Text/Metrics/Occurrences.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics.Occurrences
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Token and occurrence
  11
  12 An occurrence is not necessarily a token. Considering the sentence:
  13 "A rose is a rose is a rose". We may equally correctly state that there
  14 are eight or three words in the sentence. There are, in fact, three word
  15 types in the sentence: "rose", "is" and "a". There are eight word tokens
  16 in a token copy of the line. The line itself is a type. There are not
  17 eight word types in the line. It contains (as stated) only the three
  18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
  19 call what there are eight of? They are occurrences of words. There are
  20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
  21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
  22
  23 -}
  24
  25 {-# LANGUAGE NoImplicitPrelude #-}
  26 {-# LANGUAGE OverloadedStrings #-}
  27
  28 module Gargantext.Text.Metrics.Occurrences
  29   where
  30
  31 import Gargantext.Prelude
  32
  33 import Data.Map.Strict  (Map
  34                         , empty
  35                         , insertWith, unionWith
  36                         )
  37
  38 import Control.Monad ((>>),(>>=))
  39 import Data.String (String())
  40 import Data.Attoparsec.Text
  41 import Data.Text (Text)
  42
  43 import Data.Either.Extra(Either(..))
  44 import qualified Data.Text as T
  45 import Control.Applicative hiding (empty)
  46 -----------------------------------------------------------
  47
  48 type Occ = Int
  49
  50 -- | Compute the occurrences (occ)
  51 occ :: Ord a => [a] -> Map a Occ
  52 occ xs = foldl' (\x y -> insertWith (+) y 1 x) empty xs
  53
  54 -- TODO add groups and filter stops
  55 sumOcc :: Ord a => [Map a Occ] -> Map a Occ
  56 sumOcc xs = foldl' (unionWith (+)) empty xs
  57
  58
  59 occurrenceParser :: Text -> Parser Bool
  60 occurrenceParser txt = manyTill anyChar (string txt) >> pure True
  61
  62 occurrencesParser :: Text -> Parser Int
  63 occurrencesParser txt = case txt of
  64                     "" -> pure 0
  65                     _  -> many (occurrenceParser txt') >>= \matches -> pure (length matches)
  66     where
  67         txt' = T.toLower txt
  68
  69 parseOccurrences :: Text -> Text -> Either String Int
  70 parseOccurrences x = parseOnly (occurrencesParser x)