2 Module : Gargantext.Text.Metrics.Count
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 An occurrence is not necessarily a token. Considering the sentence:
13 "A rose is a rose is a rose". We may equally correctly state that there
14 are eight or three words in the sentence. There are, in fact, three word
15 types in the sentence: "rose", "is" and "a". There are eight word tokens
16 in a token copy of the line. The line itself is a type. There are not
17 eight word types in the line. It contains (as stated) only the three
18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
19 call what there are eight of? They are occurrences of words. There are
20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
28 module Gargantext.Text.Metrics.Count
32 import Control.Arrow ((***))
33 import qualified Data.List as List
34 import Data.Map.Strict (Map
36 , insertWith, insertWithKey, unionWith
37 , toList, lookup, mapKeys
40 import qualified Data.Set as Set
41 import Data.Text (pack)
43 import qualified Data.Map.Strict as DMS
44 import Control.Monad ((>>),(>>=))
45 import Data.String (String())
46 import Data.Attoparsec.Text
48 ------------------------------------------------------------------------
49 import Gargantext.Prelude
50 import Gargantext.Core.Types
51 ------------------------------------------------------------------------
52 type Occ a = Map a Int
53 type Cooc a = Map (a, a) Int
54 type FIS a = Map (Set a) Int
56 data Group = ByStem | ByOntology
62 -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
63 -- >> map occurrences <$> Prelude.mapM (terms Mono EN)
64 -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
65 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
66 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
67 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
68 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
69 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
70 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
71 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
72 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
79 removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
80 removeApax = DMS.filter (> 1)
82 cooc :: [[Terms]] -> Map (Label, Label) Int
83 cooc tss = coocOnWithLabel _terms_stem (labelPolicy terms_occs) tss
85 terms_occs = occurrencesOn _terms_stem (List.concat tss)
88 coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
89 -> [[a]] -> Map (label, label) Coocs
90 coocOnWithLabel on policy tss =
91 mapKeys (delta policy) $ coocOn on tss
96 labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
97 labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
99 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
101 coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
102 coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
104 coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs
105 coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
107 ts' = List.nub $ map f ts
115 -- | Compute the grouped occurrences (occ)
116 occurrences :: [Terms] -> Map Grouped (Map Terms Int)
117 occurrences = occurrencesOn _terms_stem
119 occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
120 occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
122 -- TODO add groups and filter stops
124 sumOcc :: Ord a => [Occ a] -> Occ a
125 sumOcc xs = foldl' (unionWith (+)) empty xs