src/Gargantext/Text/Metrics.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Mainly reexport functions in @Data.Text.Metrics@
  11
  12
  13 TODO
  14 noApax :: Ord a => Map a Occ -> Map a Occ
  15 noApax m = M.filter (>1) m
  16
  17 -}
  18
  19 {-# LANGUAGE BangPatterns      #-}
  20 {-# LANGUAGE NoImplicitPrelude #-}
  21 {-# LANGUAGE OverloadedStrings #-}
  22
  23 module Gargantext.Text.Metrics
  24   where
  25
  26 import Data.Text (Text, pack)
  27 import Data.Ord (comparing, Down(..))
  28 import Data.Map (Map)
  29 import qualified Data.List as L
  30 import qualified Data.Map  as M
  31 import qualified Data.Set  as S
  32 import qualified Data.Text as T
  33 import qualified Data.Vector as V
  34 import qualified Data.Vector.Unboxed as VU
  35 import Data.Tuple.Extra (both)
  36 --import GHC.Real (Ratio)
  37 --import qualified Data.Text.Metrics as DTM
  38 import Data.Array.Accelerate (toList)
  39 import Math.KMeans (kmeans, euclidSq, elements)
  40
  41
  42 import Gargantext.Prelude
  43
  44 import Gargantext.Text.Metrics.Count (occurrences, cooc)
  45 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
  46 import Gargantext.Core (Lang(EN))
  47 import Gargantext.Core.Types (Terms(..))
  48 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
  49
  50 import Gargantext.Viz.Graph.Distances.Matrice
  51 import Gargantext.Viz.Graph.Index
  52
  53 import qualified Data.Array.Accelerate.Interpreter as DAA
  54 import qualified Data.Array.Accelerate as DAA
  55 -- import Data.Array.Accelerate ((:.)(..), Z(..))
  56
  57 import GHC.Real (round)
  58
  59 import Debug.Trace
  60 import Prelude (seq)
  61
  62 data MapListSize   = MapListSize   Int
  63 data InclusionSize = InclusionSize Int
  64 data SampleBins    = SampleBins    Double
  65 data Clusters      = Clusters      Int
  66 data DefaultValue  = DefaultValue  Int
  67
  68 data FilterConfig = FilterConfig { fc_mapListSize   :: MapListSize
  69                                  , fc_inclusionSize :: InclusionSize
  70                                  , fc_sampleBins    :: SampleBins
  71                                  , fc_clusters      :: Clusters
  72                                  , fc_defaultValue  :: DefaultValue
  73                              }
  74
  75 filterCooc :: Ord t => FilterConfig -> Map (t, t) Int -> Map (t, t) Int
  76 filterCooc fc cc = (filterCooc' fc) ts cc
  77   where
  78     ts     = map _scored_terms $ takeSome fc $ coocScored cc
  79
  80
  81 filterCooc' :: Ord t => FilterConfig -> [t] -> Map (t, t) Int -> Map (t, t) Int
  82 filterCooc' (FilterConfig _ _ _ _ (DefaultValue dv)) ts m = -- trace ("coocScored " <> show (length ts)) $
  83   foldl' (\m' k -> M.insert k (maybe dv identity $ M.lookup k m) m')
  84     M.empty selection
  85   where
  86     selection  = [(x,y) | x <- ts
  87                         , y <- ts
  88                        -- , x >= y
  89                         ]
  90
  91
  92 -- | Map list creation
  93 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
  94 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
  95 -- each parts is then ordered by Inclusion/Exclusion
  96 -- take n scored terms in each parts where n * SampleBins = MapListSize.
  97 takeSome :: Ord t => FilterConfig -> [Scored t] -> [Scored t]
  98 takeSome (FilterConfig (MapListSize l) (InclusionSize l') (SampleBins s) (Clusters k) _) scores = L.take l
  99                     $ takeSample n m
 100                     $ L.take l' $ sortWith (Down . _scored_incExc) scores
 101                     -- $ splitKmeans k scores
 102   where
 103     -- TODO: benchmark with accelerate-example kmeans version
 104     splitKmeans x xs = L.concat $ map elements
 105                      $ V.take (k-1)
 106                      $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
 107                               euclidSq x xs
 108     n = round ((fromIntegral l)/s)
 109     m = round $ (fromIntegral $ length scores) / (s)
 110     takeSample n m xs = -- trace ("splitKmeans " <> show (length xs)) $
 111                         L.concat $ map (L.take n)
 112                                  $ map (sortWith (Down . _scored_incExc))
 113                                  -- TODO use kmeans s instead of splitEvery
 114                                  -- in order to split in s heteregenous parts
 115                                  -- without homogeneous order hypothesis
 116                                  $ splitEvery m
 117                                  $ sortWith (Down . _scored_speGen) xs
 118
 119
 120 data Scored t = Scored { _scored_terms  :: !t
 121                        , _scored_incExc :: !InclusionExclusion
 122                        , _scored_speGen :: !SpecificityGenericity
 123                      } deriving (Show)
 124
 125 coocScored :: Ord t => Map (t,t) Int -> [Scored t]
 126 coocScored m = zipWith (\(i,t) (inc,spe) -> Scored t inc spe) (M.toList fi) scores
 127   where
 128     (ti,fi) = createIndices m
 129     (is, ss) = incExcSpeGen $ cooc2mat ti m
 130     scores = DAA.toList $ DAA.run $ DAA.zip (DAA.use is) (DAA.use ss)
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 150 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
 151   where
 152     (ti,fi) = createIndices m
 153     ordonne x = sortWith (Down . snd) $ zip (map snd $ M.toList fi) (toList x)
 154
 155
 156
 157 metrics_text :: Text
 158 metrics_text = T.intercalate " " metrics_sentences
 159
 160 metrics_sentences' :: [Text]
 161 metrics_sentences' = splitBy (Sentences 0) metrics_text
 162
 163 -- | Sentences
 164 metrics_sentences :: [Text]
 165 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
 166                     , "I can see the glass on the table."
 167                     , "There was only a spoon on that table."
 168                     , "The glass just fall from the table, pouring wine everywhere."
 169                     , "I wish the glass did not contain wine."
 170                     ]
 171
 172 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 173
 174 -- | Terms reordered to visually check occurrences
 175 -- >>>
 176 {- [ [["table"],["glass"],["wine"],["spoon"]]
 177    , [["glass"],["table"]]
 178    , [["spoon"],["table"]]
 179    , [["glass"],["table"],["wine"]]
 180    , [["glass"],["wine"]]
 181    ]
 182 -}
 183
 184 metrics_terms :: IO [[Terms]]
 185 metrics_terms = mapM (terms (MonoMulti EN)) $ splitBy (Sentences 0) metrics_text
 186
 187 -- | Occurrences
 188 {-
 189 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
 190          , (fromList ["object"],fromList [(["object"], 3 )])
 191          , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
 192          , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
 193 -}
 194 metrics_occ = occurrences <$> L.concat <$> metrics_terms
 195
 196 {-
 197 -- fromList [((["glas"],["object"]),6)
 198             ,((["glas"],["spoon"]),4)
 199             ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
 200
 201 -}
 202 metrics_cooc = cooc <$> metrics_terms
 203
 204 metrics_cooc_mat = do
 205   m <- metrics_cooc
 206   let (ti,_) = createIndices m
 207   let mat_cooc = cooc2mat ti m
 208   pure ( ti
 209        , mat_cooc
 210        , incExcSpeGen_proba  mat_cooc
 211        , incExcSpeGen        mat_cooc
 212        )
 213
 214 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
 215