]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Metrics/Count.hs
[nlp] corenlps/johnsnows/spacys for https versions of schema
[gargantext.git] / src / Gargantext / Core / Text / Metrics / Count.hs
1 {-|
2 Module : Gargantext.Core.Text.Metrics.Count
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Token and occurrence
11
12 An occurrence is not necessarily a token. Considering the sentence:
13 "A rose is a rose is a rose". We may equally correctly state that there
14 are eight or three words in the sentence. There are, in fact, three word
15 types in the sentence: "rose", "is" and "a". There are eight word tokens
16 in a token copy of the line. The line itself is a type. There are not
17 eight word types in the line. It contains (as stated) only the three
18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
19 call what there are eight of? They are occurrences of words. There are
20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
22
23 -}
24
25
26 module Gargantext.Core.Text.Metrics.Count
27 where
28
29 import Debug.Trace (trace)
30 import Data.Text (Text)
31 import Control.Arrow (Arrow(..), (***))
32 import qualified Data.List as List
33
34 import qualified Data.Map.Strict as DMS
35 import Data.Map.Strict ( Map, empty, singleton
36 , insertWith, unionWith, unionsWith
37 , mapKeys
38 )
39 import Data.Set (Set)
40 import Data.Text (pack)
41
42
43 ------------------------------------------------------------------------
44 import Gargantext.Prelude
45 import Gargantext.Core.Types
46 ------------------------------------------------------------------------
47 type Occ a = Map a Int
48 type Cooc a = Map (a, a) Int
49 type FIS a = Map (Set a) Int
50
51 data Group = ByStem | ByOntology
52
53 type Grouped = Stems
54
55
56 {-
57 -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
58 -- >> map occurrences <$> Prelude.mapM (terms Mono EN)
59 -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
60 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
61 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
62 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
63 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
64 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
65 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
66 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
67 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
68 ----
69 -}
70
71 type Occs = Int
72 type Coocs = Int
73 type Threshold = Int
74
75 removeApax :: Threshold -> Map ([Text], [Text]) Int -> Map ([Text], [Text]) Int
76 removeApax t = DMS.filter (> t)
77
78 cooc :: [[Terms]] -> Map ([Text], [Text]) Int
79 cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
80 where
81 terms_occs = occurrencesOn _terms_stem (List.concat tss)
82 label_policy = mkLabelPolicy terms_occs
83
84
85 coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
86 -> [[a]] -> Map (label, label) Coocs
87 coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
88 where
89 delta :: Arrow a => a b' c' -> a (b', b') (c', c')
90 delta f = f *** f
91
92
93 mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped [Text]
94 mkLabelPolicy = DMS.map f where
95 f = _terms_label . fst . maximumWith snd . DMS.toList
96 -- TODO use the Foldable instance of Map instead of building a list
97
98 useLabelPolicy :: Map Grouped [Text] -> Grouped -> [Text]
99 useLabelPolicy m g = case DMS.lookup g m of
100 Just label -> label
101 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
102 -- TODO: use a non-fatal error if this can happen in practice
103 {-
104 labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
105 labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
106 Just label -> label
107 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
108 -}
109
110 coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Int
111 coocOn f as = DMS.unionsWith (+) $ map (coocOn' f) as
112
113 coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Int
114 coocOn' fun ts = DMS.fromListWith (+) xs
115 where
116 ts' = List.nub $ map fun ts
117 xs = [ ((x, y), 1)
118 | x <- ts'
119 , y <- ts'
120 , x >= y
121 ]
122
123
124 ------------------------------------------------------------------------
125 coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Int
126 coocOnContexts fun = DMS.fromListWith (+) . List.concat . map (coocOnSingleContext fun)
127
128 coocOnSingleContext :: (a -> [Text]) -> [a] -> [(([Text], [Text]), Int)]
129 coocOnSingleContext fun ts = xs
130 where
131 ts' = List.nub $ map fun ts
132 xs = [ ((x, y), 1)
133 | x <- ts'
134 , y <- ts'
135 , x >= y
136 ]
137 ------------------------------------------------------------------------
138
139
140 -- | Compute the grouped occurrences (occ)
141 occurrences :: [Terms] -> Map Grouped (Map Terms Int)
142 occurrences = occurrencesOn _terms_stem
143
144 occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
145 occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
146
147 occurrencesWith :: (Foldable list, Ord k, Num a, Show k, Show a, Show (list b)) => (b -> k) -> list b -> Map k a
148 occurrencesWith f xs = trace (show (xs,m)) m
149 where
150 m = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs
151
152 -- TODO add groups and filter stops
153
154 sumOcc :: Ord a => [Occ a] -> Occ a
155 sumOcc xs = unionsWith (+) xs
156
157