]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Metrics/Count.hs
[conduit] some work towards migrating file parser to conduit (does not compile)
[gargantext.git] / src / Gargantext / Core / Text / Metrics / Count.hs
1 {-|
2 Module : Gargantext.Core.Text.Metrics.Count
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Token and occurrence
11
12 An occurrence is not necessarily a token. Considering the sentence:
13 "A rose is a rose is a rose". We may equally correctly state that there
14 are eight or three words in the sentence. There are, in fact, three word
15 types in the sentence: "rose", "is" and "a". There are eight word tokens
16 in a token copy of the line. The line itself is a type. There are not
17 eight word types in the line. It contains (as stated) only the three
18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
19 call what there are eight of? They are occurrences of words. There are
20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
22
23 -}
24
25
26 module Gargantext.Core.Text.Metrics.Count
27 where
28
29 import Data.Text (Text)
30 import Control.Arrow (Arrow(..), (***))
31 import qualified Data.List as List
32
33 import qualified Data.Map.Strict as DMS
34 import Data.Map.Strict ( Map, empty, singleton
35 , insertWith, unionWith, unionsWith
36 , mapKeys
37 )
38 import Data.Set (Set)
39 import Data.Text (pack)
40
41
42 ------------------------------------------------------------------------
43 import Gargantext.Prelude
44 import Gargantext.Core.Types
45 ------------------------------------------------------------------------
46 type Occ a = Map a Int
47 type Cooc a = Map (a, a) Int
48 type FIS a = Map (Set a) Int
49
50 data Group = ByStem | ByOntology
51
52 type Grouped = Stems
53
54
55 {-
56 -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
57 -- >> map occurrences <$> Prelude.mapM (terms Mono EN)
58 -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
59 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
60 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
61 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
62 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
63 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
64 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
65 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
66 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
67 ----
68 -}
69
70 type Occs = Int
71 type Coocs = Int
72 type Threshold = Int
73
74 removeApax :: Threshold -> Map ([Text], [Text]) Int -> Map ([Text], [Text]) Int
75 removeApax t = DMS.filter (> t)
76
77 cooc :: [[Terms]] -> Map ([Text], [Text]) Int
78 cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
79 where
80 terms_occs = occurrencesOn _terms_stem (List.concat tss)
81 label_policy = mkLabelPolicy terms_occs
82
83
84 coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
85 -> [[a]] -> Map (label, label) Coocs
86 coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
87 where
88 delta :: Arrow a => a b' c' -> a (b', b') (c', c')
89 delta f = f *** f
90
91
92 mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped [Text]
93 mkLabelPolicy = DMS.map f where
94 f = _terms_label . fst . maximumWith snd . DMS.toList
95 -- TODO use the Foldable instance of Map instead of building a list
96
97 useLabelPolicy :: Map Grouped [Text] -> Grouped -> [Text]
98 useLabelPolicy m g = case DMS.lookup g m of
99 Just label -> label
100 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
101 -- TODO: use a non-fatal error if this can happen in practice
102 {-
103 labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
104 labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
105 Just label -> label
106 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
107 -}
108
109 coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Int
110 coocOn f as = DMS.unionsWith (+) $ map (coocOn' f) as
111
112 coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Int
113 coocOn' fun ts = DMS.fromListWith (+) xs
114 where
115 ts' = List.nub $ map fun ts
116 xs = [ ((x, y), 1)
117 | x <- ts'
118 , y <- ts'
119 , x >= y
120 ]
121
122
123 ------------------------------------------------------------------------
124 coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Int
125 coocOnContexts fun = DMS.fromListWith (+) . List.concat . map (coocOnSingleContext fun)
126
127 coocOnSingleContext :: (a -> [Text]) -> [a] -> [(([Text], [Text]), Int)]
128 coocOnSingleContext fun ts = xs
129 where
130 ts' = List.nub $ map fun ts
131 xs = [ ((x, y), 1)
132 | x <- ts'
133 , y <- ts'
134 , x >= y
135 ]
136 ------------------------------------------------------------------------
137
138
139 -- | Compute the grouped occurrences (occ)
140 occurrences :: [Terms] -> Map Grouped (Map Terms Int)
141 occurrences = occurrencesOn _terms_stem
142
143 occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
144 occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
145
146 occurrencesWith :: (Foldable list, Ord k, Num a) => (b -> k) -> list b -> Map k a
147 occurrencesWith f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs
148
149 -- TODO add groups and filter stops
150
151 sumOcc :: Ord a => [Occ a] -> Occ a
152 sumOcc xs = unionsWith (+) xs
153
154