src/Gargantext/Text/Metrics.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Mainly reexport functions in @Data.Text.Metrics@
  11
  12
  13 TODO
  14 noApax :: Ord a => Map a Occ -> Map a Occ
  15 noApax m = M.filter (>1) m
  16
  17 -}
  18
  19 {-# LANGUAGE BangPatterns      #-}
  20 {-# LANGUAGE NoImplicitPrelude #-}
  21 {-# LANGUAGE OverloadedStrings #-}
  22
  23 module Gargantext.Text.Metrics
  24   where
  25
  26 import Data.Ord (Down(..))
  27 import qualified Data.List as L
  28
  29 import Data.Map (Map)
  30 import qualified Data.Map  as M
  31
  32 import Data.Text (Text)
  33 import qualified Data.Text as T
  34
  35 import Data.Tuple.Extra (both)
  36 --import GHC.Real (Ratio)
  37 --import qualified Data.Text.Metrics as DTM
  38 import Data.Array.Accelerate (toList, Matrix)
  39 --import Math.KMeans (kmeans, euclidSq, elements)
  40
  41 import Gargantext.Prelude
  42 import Gargantext.Text.Metrics.Count (occurrences, cooc)
  43 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
  44 import Gargantext.Core (Lang(EN))
  45 import Gargantext.Core.Types (Terms(..), Label)
  46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
  47 import Gargantext.Text.Metrics.Count (Grouped)
  48 import Gargantext.Viz.Graph.Distances.Matrice
  49 import Gargantext.Viz.Graph.Index
  50
  51 import qualified Data.Array.Accelerate.Interpreter as DAA
  52 import qualified Data.Array.Accelerate as DAA
  53 -- import Data.Array.Accelerate ((:.)(..), Z(..))
  54
  55 import GHC.Real (round)
  56
  57 --import Debug.Trace
  58
  59 data MapListSize   = MapListSize   Int
  60 data InclusionSize = InclusionSize Int
  61 data SampleBins    = SampleBins    Double
  62 data Clusters      = Clusters      Int
  63 data DefaultValue  = DefaultValue  Int
  64
  65 data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
  66                                  , fc_inclusionSize :: InclusionSize
  67                                  , fc_sampleBins    :: SampleBins
  68                                  , fc_clusters      :: Clusters
  69                                  , fc_defaultValue  :: DefaultValue
  70                              }
  71
  72 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
  73 filterCooc fc cc = (filterCooc' fc) ts cc
  74   where
  75     ts     = map _scored_terms $ takeSome fc $ coocScored cc
  76
  77
  78 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
  79 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
  80   foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
  81     M.empty selection
  82   where
  83     selection  = [(x,y) | x <- ts
  84                         , y <- ts
  85                        -- , x >= y
  86                         ]
  87
  88
  89 -- | Map list creation
  90 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
  91 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
  92 -- each parts is then ordered by Inclusion/Exclusion
  93 -- take n scored terms in each parts where n * SampleBins = MapListSize.
  94 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
  95 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
  96                     $ takeSample n m
  97                     $ L.take l' $ sortWith (Down . _scored_incExc) scores
  98                     -- $ splitKmeans k scores
  99   where
 100     -- TODO: benchmark with accelerate-example kmeans version
 101     --splitKmeans x xs = L.concat $ map elements
 102     --                 $ V.take (k-1)
 103     --                 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
 104     --                          euclidSq x xs
 105     n = round ((fromIntegral l)/s)
 106     m = round $ (fromIntegral $ length scores) / (s)
 107     takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
 108                         L.concat $ map (L.take n')
 109                                  $ map (sortWith (Down . _scored_incExc))
 110                                  -- TODO use kmeans s instead of splitEvery
 111                                  -- in order to split in s heteregenous parts
 112                                  -- without homogeneous order hypothesis
 113                                  $ splitEvery m'
 114                                  $ sortWith (Down . _scored_speGen) xs
 115
 116
 117 data Scored t = Scored { _scored_terms  :: !t
 118                        , _scored_incExc :: !InclusionExclusion
 119                        , _scored_speGen :: !SpecificityGenericity
 120                      } deriving (Show)
 121
 122 -- TODO in the textflow we end up needing these indices, it might be better
 123 -- to compute them earlier and pass them around.
 124 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
 125 coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
 126   where
 127     (ti,fi) = createIndices m
 128     (is, ss) = incExcSpeGen $ cooc2mat ti m
 129     scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 149 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
 150   where
 151     (ti,fi) = createIndices m
 152     ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
 153
 154
 155
 156 metrics_text :: Text
 157 metrics_text = T.intercalate " " metrics_sentences
 158
 159 metrics_sentences' :: [Text]
 160 metrics_sentences' = splitBy (Sentences 0) metrics_text
 161
 162 -- | Sentences
 163 metrics_sentences :: [Text]
 164 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
 165                     , "I can see the glass on the table."
 166                     , "There was only a spoon on that table."
 167                     , "The glass just fall from the table, pouring wine everywhere."
 168                     , "I wish the glass did not contain wine."
 169                     ]
 170
 171 metrics_sentences_Test :: Bool
 172 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 173
 174 -- | Terms reordered to visually check occurrences
 175 -- >>>
 176 {- [ [["table"],["glass"],["wine"],["spoon"]]
 177    , [["glass"],["table"]]
 178    , [["spoon"],["table"]]
 179    , [["glass"],["table"],["wine"]]
 180    , [["glass"],["wine"]]
 181    ]
 182 -}
 183
 184 metrics_terms :: IO [[Terms]]
 185 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
 186
 187 -- | Occurrences
 188 {-
 189 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
 190          , (fromList ["object"],fromList [(["object"], 3 )])
 191          , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
 192          , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
 193 -}
 194 metrics_occ :: IO (Map Grouped (Map Terms Int))
 195 metrics_occ = occurrences <$> L.concat <$> metrics_terms
 196
 197 {-
 198 -- fromList [((["glas"],["object"]),6)
 199             ,((["glas"],["spoon"]),4)
 200             ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
 201
 202 -}
 203 metrics_cooc :: IO (Map (Label, Label) Int)
 204 metrics_cooc = cooc <$> metrics_terms
 205
 206 metrics_cooc_mat :: IO (Map Label Index, Matrix Int, Matrix Double, (DAA.Vector InclusionExclusion, DAA.Vector SpecificityGenericity))
 207 metrics_cooc_mat = do
 208   m <- metrics_cooc
 209   let (ti,_) = createIndices m
 210   let mat_cooc = cooc2mat ti m
 211   pure ( ti
 212        , mat_cooc
 213        , incExcSpeGen_proba  mat_cooc
 214        , incExcSpeGen        mat_cooc
 215        )
 216
 217 metrics_incExcSpeGen :: IO ([(Label, Double)], [(Label, Double)])
 218 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
 219