2 Module : Gargantext.Text.Metrics.Count
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 An occurrence is not necessarily a token. Considering the sentence:
13 "A rose is a rose is a rose". We may equally correctly state that there
14 are eight or three words in the sentence. There are, in fact, three word
15 types in the sentence: "rose", "is" and "a". There are eight word tokens
16 in a token copy of the line. The line itself is a type. There are not
17 eight word types in the line. It contains (as stated) only the three
18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
19 call what there are eight of? They are occurrences of words. There are
20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
28 module Gargantext.Text.Metrics.Count
32 import Control.Arrow (Arrow(..), (***))
33 import qualified Data.List as List
35 import qualified Data.Map.Strict as DMS
36 import Data.Map.Strict ( Map, empty, singleton
37 , insertWith, unionWith
41 import Data.Text (pack)
44 ------------------------------------------------------------------------
45 import Gargantext.Prelude
46 import Gargantext.Core.Types
47 ------------------------------------------------------------------------
48 type Occ a = Map a Int
49 type Cooc a = Map (a, a) Int
50 type FIS a = Map (Set a) Int
52 data Group = ByStem | ByOntology
58 -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
59 -- >> map occurrences <$> Prelude.mapM (terms Mono EN)
60 -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
61 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
62 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
63 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
64 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
65 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
66 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
67 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
68 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
76 removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int
77 removeApax t = DMS.filter (> t)
79 cooc :: [[Terms]] -> Map (Label, Label) Int
80 cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
82 terms_occs = occurrencesOn _terms_stem (List.concat tss)
83 label_policy = mkLabelPolicy terms_occs
86 coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
87 -> [[a]] -> Map (label, label) Coocs
88 coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
90 delta :: Arrow a => a b' c' -> a (b', b') (c', c')
94 mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label
95 mkLabelPolicy = DMS.map f where
96 f = _terms_label . fst . maximumWith snd . DMS.toList
97 -- TODO use the Foldable instance of Map instead of building a list
99 useLabelPolicy :: Map Grouped Label -> Grouped -> Label
100 useLabelPolicy m g = case DMS.lookup g m of
102 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
104 labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
105 labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
107 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
110 coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
111 coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
113 coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs
114 coocOn' fun ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
116 ts' = List.nub $ map fun ts
124 -- | Compute the grouped occurrences (occ)
125 occurrences :: [Terms] -> Map Grouped (Map Terms Int)
126 occurrences = occurrencesOn _terms_stem
128 occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
129 occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
131 -- TODO add groups and filter stops
133 sumOcc :: Ord a => [Occ a] -> Occ a
134 sumOcc xs = foldl' (unionWith (+)) empty xs