]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics/Count.hs
Some progress on workflow, the data2graph fails because of unknown nodes, see the...
[gargantext.git] / src / Gargantext / Text / Metrics / Count.hs
1 {-|
2 Module : Gargantext.Text.Metrics.Count
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Token and occurrence
11
12 An occurrence is not necessarily a token. Considering the sentence:
13 "A rose is a rose is a rose". We may equally correctly state that there
14 are eight or three words in the sentence. There are, in fact, three word
15 types in the sentence: "rose", "is" and "a". There are eight word tokens
16 in a token copy of the line. The line itself is a type. There are not
17 eight word types in the line. It contains (as stated) only the three
18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
19 call what there are eight of? They are occurrences of words. There are
20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
22
23 -}
24
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27
28 module Gargantext.Text.Metrics.Count
29 where
30
31
32 import Control.Arrow ((***))
33 import qualified Data.List as List
34 import Data.Map.Strict (Map
35 , empty, singleton
36 , insertWith, insertWithKey, unionWith
37 , toList, lookup, mapKeys
38 )
39 import Data.Set (Set)
40 import qualified Data.Set as Set
41 import Data.Text (pack)
42
43 import qualified Data.Map.Strict as DMS
44 import Control.Monad ((>>),(>>=))
45 import Data.String (String())
46 import Data.Attoparsec.Text
47
48 ------------------------------------------------------------------------
49 import Gargantext.Prelude
50 import Gargantext.Core.Types
51 ------------------------------------------------------------------------
52 type Occ a = Map a Int
53 type Cooc a = Map (a, a) Int
54 type FIS a = Map (Set a) Int
55
56 data Group = ByStem | ByOntology
57
58 type Grouped = Stems
59
60
61 {-
62 -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
63 -- >> map occurrences <$> Prelude.mapM (terms Mono EN)
64 -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
65 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
66 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
67 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
68 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
69 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
70 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
71 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
72 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
73 ----
74 -}
75
76 type Occs = Int
77 type Coocs = Int
78
79 removeApax :: Map (Label, Label) Int -> Map (Label, Label) Int
80 removeApax = DMS.filter (> 1)
81
82 cooc :: [[Terms]] -> Map (Label, Label) Int
83 cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
84 where
85 terms_occs = occurrencesOn _terms_stem (List.concat tss)
86 label_policy = mkLabelPolicy terms_occs
87
88
89 coocOnWithLabel :: (Ord label, Ord b) => (a -> b) -> (b -> label)
90 -> [[a]] -> Map (label, label) Coocs
91 coocOnWithLabel on policy tss =
92 mapKeys (delta policy) $ coocOn on tss
93 where
94 delta f = f *** f
95
96
97 mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label
98 mkLabelPolicy = DMS.map f where
99 f = _terms_label . fst . maximumWith snd . DMS.toList
100 -- TODO use the Foldable instance of Map instead of building a list
101
102 useLabelPolicy :: Map Grouped Label -> Grouped -> Label
103 useLabelPolicy m g = case DMS.lookup g m of
104 Just label -> label
105 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
106 {-
107 labelPolicy :: Map Grouped (Map Terms Occs) -> Grouped -> Label
108 labelPolicy m g = case _terms_label <$> fst <$> maximumWith snd <$> DMS.toList <$> lookup g m of
109 Just label -> label
110 Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
111 -}
112
113 coocOn :: Ord b => (a -> b) -> [[a]] -> Map (b, b) Coocs
114 coocOn f as = foldl' (\a b -> DMS.unionWith (+) a b) empty $ map (coocOn' f) as
115 where
116 coocOn' :: Ord b => (a -> b) -> [a] -> Map (b, b) Coocs
117 coocOn' f ts = foldl' (\m (xy,c) -> insertWith ((+)) xy c m) empty xs
118 where
119 ts' = List.nub $ map f ts
120 xs = [ ((x, y), 1)
121 | x <- ts'
122 , y <- ts'
123 -- , x /= y
124 ]
125
126
127 -- | Compute the grouped occurrences (occ)
128 occurrences :: [Terms] -> Map Grouped (Map Terms Int)
129 occurrences = occurrencesOn _terms_stem
130
131 occurrencesOn :: (Ord a, Ord b) => (a -> b) -> [a] -> Map b (Map a Int)
132 occurrencesOn f = foldl' (\m a -> insertWith (unionWith (+)) (f a) (singleton a 1) m) empty
133
134 -- TODO add groups and filter stops
135
136 sumOcc :: Ord a => [Occ a] -> Occ a
137 sumOcc xs = foldl' (unionWith (+)) empty xs
138
139