src/Gargantext/Text/Metrics.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Mainly reexport functions in @Data.Text.Metrics@
  11
  12 -}
  13
  14 {-# LANGUAGE BangPatterns      #-}
  15 {-# LANGUAGE NoImplicitPrelude #-}
  16 {-# LANGUAGE OverloadedStrings #-}
  17
  18 module Gargantext.Text.Metrics
  19   where
  20
  21 import Data.Ord (Down(..))
  22 import qualified Data.List as L
  23
  24 import Data.Map (Map)
  25 import qualified Data.Map  as M
  26
  27 --import Math.KMeans (kmeans, euclidSq, elements)
  28
  29 import Gargantext.Prelude
  30 import Gargantext.Viz.Graph.Distances.Matrice
  31 import Gargantext.Viz.Graph.Index
  32
  33 import qualified Data.Array.Accelerate.Interpreter as DAA
  34 import qualified Data.Array.Accelerate as DAA
  35 -- import Data.Array.Accelerate ((:.)(..), Z(..))
  36
  37 import GHC.Real (round)
  38
  39 import Debug.Trace (trace)
  40
  41 data MapListSize   = MapListSize   Int
  42 data InclusionSize = InclusionSize Int
  43 data SampleBins    = SampleBins    Double
  44 data Clusters      = Clusters      Int
  45 data DefaultValue  = DefaultValue  Int
  46
  47 data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
  48                                  , fc_inclusionSize :: InclusionSize
  49                                  , fc_sampleBins    :: SampleBins
  50                                  , fc_clusters      :: Clusters
  51                                  , fc_defaultValue  :: DefaultValue
  52                              }
  53
  54 filterCooc :: (Show t, Ord t) => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
  55 filterCooc fc cc = (filterCooc' fc) ts cc
  56   where
  57     ts     = map _scored_terms $ takeSome fc $ coocScored cc
  58
  59 filterCooc' :: (Show t, Ord t) => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
  60 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = trace ("coocScored " <> show ts) $
  61   foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
  62     M.empty selection
  63   where
  64     selection  = [(x,y) | x <- ts
  65                         , y <- ts
  66                       --  , x >= y
  67                         ]
  68
  69
  70 -- | Map list creation
  71 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
  72 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
  73 -- each parts is then ordered by Inclusion/Exclusion
  74 -- take n scored terms in each parts where n * SampleBins = MapListSize.
  75 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
  76 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters _) _) scores = L.take l
  77                     $ takeSample n m
  78                     $ L.take l' $ reverse $ sortWith (Down . _scored_incExc) scores
  79                     -- splitKmeans k scores
  80   where
  81     -- TODO: benchmark with accelerate-example kmeans version
  82     --splitKmeans x xs = L.concat $ map elements
  83     --                 $ V.take (k-1)
  84     --                 $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
  85     --                          euclidSq x xs
  86     n = round ((fromIntegral l)/s)
  87     m = round $ (fromIntegral $ length scores) / (s)
  88     takeSample n' m' xs = -- trace ("splitKmeans " <> show (length xs)) $
  89                         L.concat $ map (L.take n')
  90                                  $ map (sortWith (Down . _scored_incExc))
  91                                  -- TODO use kmeans s instead of splitEvery
  92                                  -- in order to split in s heteregenous parts
  93                                  -- without homogeneous order hypothesis
  94                                  $ splitEvery m'
  95                                  $ sortWith (Down . _scored_speGen) xs
  96
  97
  98 data Scored ts = Scored { _scored_terms :: !ts
  99                         , _scored_incExc :: !InclusionExclusion
 100                         , _scored_speGen :: !SpecificityGenericity
 101                         } deriving (Show)
 102
 103 -- TODO in the textflow we end up needing these indices, it might be better
 104 -- to compute them earlier and pass them around.
 105 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
 106 coocScored m = zipWith (\(_,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
 107   where
 108     (ti,fi) = createIndices m
 109     (is, ss) = incExcSpeGen $ cooc2mat ti m
 110     scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)