]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics/Count.hs
[FIX][DB][FLOW] insert listngrams.
[gargantext.git] / src / Gargantext / Text / Metrics / Count.hs
1 {-|
2 Module : Gargantext.Text.Metrics.Count
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Token and occurrence
11
12 An occurrence is not necessarily a token. Considering the sentence:
13 "A rose is a rose is a rose". We may equally correctly state that there
14 are eight or three words in the sentence. There are, in fact, three word
15 types in the sentence: "rose", "is" and "a". There are eight word tokens
16 in a token copy of the line. The line itself is a type. There are not
17 eight word types in the line. It contains (as stated) only the three
18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
19 call what there are eight of? They are occurrences of words. There are
20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
22
23 -}
24
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27
28 module Gargantext.Text.Metrics.Count
29 where
30
31 import Data.Text (Text)
32 import Control.Arrow (Arrow(..), (***))
33 import qualified Data.List as List
34
35 import qualified Data.Map.Strict as DMS
36 import Data.Map.Strict ( Map, empty, singleton
37 , insertWith, unionWith, unionsWith
38 , mapKeys
39 )
40 import Data.Set (Set)
41 import Data.Text (pack)
42
43
44 ------------------------------------------------------------------------
45 import Gargantext.Prelude
46 import Gargantext.Core.Types
47 ------------------------------------------------------------------------
48 type Occ a = Map a Int
49 type Cooc a = Map (a, a) Int
50 type FIS a = Map (Set a) Int
51
52 data Group = ByStem | ByOntology
53
54 type Grouped = Stems
55
56
57 {-
58 -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
59 -- >> map occurrences <$> Prelude.mapM (terms Mono EN)
60 -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
61 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
62 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
63 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
64 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
65 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
66 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
67 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
68 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
69 ----
70 -}
71
72 type Occs = Int
73 type Coocs = Int
74 type Threshold = Int
75
76 removeApax :: Threshold -> Map ([Text], [Text]) Int -> Map ([Text], [Text]) Int
77 removeApax t = DMS.filter (> t)
78
79 cooc :: [[Terms]] -> Map ([Text], [Text]) Int
80 cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
81 where
82 terms_occs = occurrencesOn _terms_stem (List.concat tss)
83 label_policy = mkLabelPolicy terms_occs
84
85
86 coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
87 -> [[a]] -> Map (label, label) Coocs
88 coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
89 where
90 delta :: Arrow a => a b' c' -> a (b', b') (c', c')
91 delta f = f *** f
92
93
94 mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped [Text]
95 mkLabelPolicy = DMS.map f where
96 f = _terms_label . fst . maximumWith snd . DMS.toList
97 -- TODO use the Foldable instance of Map instead of building a list
98
99 useLabelPolicy :: Map Grouped [Text] -> Grouped -> [Text]
100 useLabelPolicy m g = case DMS.lookup g m of
101 Just label -> label
102 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
103 -- TODO: use a non-fatal error if this can happen in practice
104 {-
105 labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
106 labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
107 Just label -> label
108 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
109 -}
110
111 coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
112 coocOn f as = DMS.unionsWith (+) $ map (coocOn' f) as
113
114 coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs
115 coocOn' fun ts = DMS.fromListWith (+) xs
116 where
117 ts' = List.nub $ map fun ts
118 xs = [ ((x, y), 1)
119 | x <- ts'
120 , y <- ts'
121 , x >= y
122 ]
123
124 ------------------------------------------------------------------------
125
126 coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Int
127 coocOnContexts fun = DMS.fromListWith (+) . List.concat . map (coocOnSingleContext fun)
128
129 coocOnSingleContext :: (a -> [Text]) -> [a] -> [(([Text], [Text]), Int)]
130 coocOnSingleContext fun ts = xs
131 where
132 ts' = List.nub $ map fun ts
133 xs = [ ((x, y), 1)
134 | x <- ts'
135 , y <- ts'
136 , x >= y
137 ]
138 ------------------------------------------------------------------------
139
140
141 -- | Compute the grouped occurrences (occ)
142 occurrences :: [Terms] -> Map Grouped (Map Terms Int)
143 occurrences = occurrencesOn _terms_stem
144
145 occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
146 occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
147
148 occurrencesWith :: (Foldable list, Ord k, Num a) => (b -> k) -> list b -> Map k a
149 occurrencesWith f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs
150
151 -- TODO add groups and filter stops
152
153 sumOcc :: Ord a => [Occ a] -> Occ a
154 sumOcc xs = unionsWith (+) xs
155
156