]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Metrics/Occurrences.hs
[FEAT] adding pipeline.
[gargantext.git] / src / Gargantext / Text / Metrics / Occurrences.hs
1 {-|
2 Module : Gargantext.Text.Metrics.Occurrences
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Token and occurrence
11
12 An occurrence is not necessarily a token. Considering the sentence:
13 "A rose is a rose is a rose". We may equally correctly state that there
14 are eight or three words in the sentence. There are, in fact, three word
15 types in the sentence: "rose", "is" and "a". There are eight word tokens
16 in a token copy of the line. The line itself is a type. There are not
17 eight word types in the line. It contains (as stated) only the three
18 word types, 'a', 'is' and 'rose', each of which is unique. So what do we
19 call what there are eight of? They are occurrences of words. There are
20 three occurrences of the word type 'a', two of 'is' and three of 'rose'.
21 Source : https://en.wikipedia.org/wiki/Type%E2%80%93token_distinction#Occurrences
22
23 -}
24
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27
28 module Gargantext.Text.Metrics.Occurrences
29 where
30
31
32 import Data.Map.Strict (Map
33 , empty
34 , insertWith, insertWithKey, unionWith
35 , toList
36 )
37 import Data.Set (Set)
38 import qualified Data.Map.Strict as DMS
39 import Control.Monad ((>>),(>>=))
40 import Data.String (String())
41 import Data.Attoparsec.Text
42
43 ------------------------------------------------------------------------
44 import Gargantext.Prelude
45 import Gargantext.Core.Types
46 ------------------------------------------------------------------------
47 type Occ a = Map a Int
48 type Cooc a = Map (a, a) Int
49 type FIS a = Map (Set a) Int
50
51 data Group = ByStem | ByOntology
52
53 type Grouped = Stems
54
55
56 -- >> let testData = ["blue lagoon", "blues lagoon", "red lagoon"]
57 -- >> map occurrences <$> Prelude.mapM (terms Mono EN)
58 -- [fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["blue"],1),(fromList ["lagoon"],1)],fromList [(fromList ["lagoon"],1),(fromList ["red"],1)]]
59 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon"]
60 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),1)]
61 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon", "red lagoon"]
62 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
63 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon", "red lagoon red lagoon", "red lagoon"]
64 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
65 --λ: cooc <$> Prelude.map occurrences <$> Prelude.mapM (terms Mono EN) ["blue lagoon", "blues lagoon blues lagoon", "red lagoon red lagoon", "red lagoon"]
66 --fromList [((fromList ["blue"],fromList ["lagoon"]),2),((fromList ["lagoon"],fromList ["red"]),2)]
67 ----
68
69 cooc :: (Ord b, Num a) => [Map b a] -> Map (b, b) a
70 cooc ts = cooc' $ map cooc'' ts
71
72 cooc' :: (Ord b, Num a) => [Map (b, b) a] -> Map (b,b) a
73 cooc' = foldl' (\x y -> unionWith (+) x y) empty
74
75 cooc'' :: (Ord b, Num a) => Map b a -> Map (b, b) a
76 cooc'' m = foldl' (\x (y,c) -> insertWith (+) y c x) empty xs
77 where
78 xs =[ ((x'',y''), c') | x' <- toList m
79 , y' <- toList m
80 , let x'' = fst x'
81 , let y'' = fst y'
82 , x'' < y''
83 , let c' = 1
84 --, let c' = snd x' + snd y'
85 ]
86
87
88 -- | Compute the grouped occurrences (occ)
89 occurrences :: [Terms] -> Map Grouped Int
90 occurrences = occurrences' _terms_stem
91
92 occurrences' :: Ord b => (a -> b) -> [a] -> Occ b
93 occurrences' f xs = foldl' (\x y -> insertWith (+) (f y) 1 x) empty xs
94
95
96 -- TODO add groups and filter stops
97 sumOcc :: Ord a => [Occ a] -> Occ a
98 sumOcc xs = foldl' (unionWith (+)) empty xs
99
100