src/Gargantext/Text/Metrics.hs

   1 {-|
   2 Module      : Gargantext.Text.Metrics
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Mainly reexport functions in @Data.Text.Metrics@
  11
  12
  13 TODO
  14 noApax :: Ord a => Map a Occ -> Map a Occ
  15 noApax m = M.filter (>1) m
  16
  17 -}
  18
  19 {-# LANGUAGE NoImplicitPrelude #-}
  20 {-# LANGUAGE OverloadedStrings #-}
  21
  22 module Gargantext.Text.Metrics
  23   where
  24
  25 import Data.Text (Text, pack)
  26 import Data.Map (Map)
  27 import qualified Data.List as L
  28 import qualified Data.Map  as M
  29 import qualified Data.Set  as S
  30 import qualified Data.Text as T
  31 import qualified Data.Vector as V
  32 import qualified Data.Vector.Unboxed as VU
  33 import Data.Tuple.Extra (both)
  34 --import GHC.Real (Ratio)
  35 --import qualified Data.Text.Metrics as DTM
  36 import Data.Array.Accelerate (toList)
  37 import Math.KMeans (kmeans, euclidSq, elements)
  38
  39
  40 import Gargantext.Prelude
  41
  42 import Gargantext.Text.Metrics.Count (occurrences, cooc)
  43 import Gargantext.Text.Terms (TermType(MonoMulti), terms)
  44 import Gargantext.Core (Lang(EN))
  45 import Gargantext.Core.Types (Terms(..))
  46 import Gargantext.Text.Context (splitBy, SplitContext(Sentences))
  47
  48 import Gargantext.Viz.Graph.Distances.Matrice
  49 import Gargantext.Viz.Graph.Index
  50
  51 import qualified Data.Array.Accelerate.Interpreter as DAA
  52 import qualified Data.Array.Accelerate as DAA
  53
  54 import GHC.Real (round)
  55
  56 --filterCooc :: Ord t => Map (t, t) Int -> Map (t, t) Int
  57 --filterCooc m =
  58 ---- filterCooc m = foldl (\k -> maybe (panic "no key") identity $ M.lookup k m) M.empty selection
  59 ----(ti, fi)  = createIndices m
  60 -- . fromIndex fi $ filterMat $ cooc2mat ti m
  61
  62
  63 type MapListSize  = Int
  64 type SampleBins = Double
  65 type Clusters = Int
  66
  67 -- | Map list creation
  68 -- Kmeans split into (Clusters::Int) main clusters with Inclusion/Exclusion (relevance score)
  69 -- Sample the main cluster ordered by specificity/genericity in (SampleBins::Double) parts
  70 -- each parts is then ordered by Inclusion/Exclusion
  71 -- take n scored terms in each parts where n * SampleBins = MapListSize.
  72 takeSome :: Ord t => MapListSize -> SampleBins -> Clusters -> [Scored t] -> [Scored t]
  73 takeSome l s k scores = L.take l
  74                     $ takeSample n m
  75                     $ splitKmeans k scores
  76   where
  77     -- TODO: benchmark with accelerate-example kmeans version
  78     splitKmeans x xs = elements
  79                      $ V.head
  80                      $ kmeans (\i -> VU.fromList ([(_scored_incExc i :: Double)]))
  81                               euclidSq x xs
  82     n = round ((fromIntegral l)/s)
  83     m = round $ (fromIntegral $ length scores) / (s)
  84     takeSample n m xs = L.concat $ map (L.take n)
  85                                  $ L.reverse $ map (L.sortOn _scored_incExc)
  86                                  $ splitEvery m
  87                                  $ L.reverse $ L.sortOn _scored_speGen xs
  88
  89
  90 data Scored t = Scored { _scored_terms  :: t
  91                        , _scored_incExc :: InclusionExclusion
  92                        , _scored_speGen :: SpecificityGenericity
  93                      } deriving (Show)
  94
  95 incExcSpeGen_sorted' :: (DAA.Elt t, Ord t) => Map (t,t) Int -> [Scored t]
  96 incExcSpeGen_sorted' m = map (\(t,inc,spe) -> Scored t inc spe) scores
  97   where
  98     (ti,fi) = createIndices m
  99     (is, ss) = incExcSpeGen $ cooc2mat ti m
 100     scores = DAA.toList $ DAA.run $ DAA.zip3 (DAA.use ts) (DAA.use is) (DAA.use ss)
 101     ts = DAA.fromList (DAA.arrayShape is) (snd <$> M.toAscList fi)
 102     -- TODO fi should already be a Vector
 103
 104
 105 incExcSpeGen_sorted :: Ord t => Map (t,t) Int -> ([(t,Double)],[(t,Double)])
 106 incExcSpeGen_sorted m = both ordonne (incExcSpeGen $ cooc2mat ti m)
 107   where
 108     (ti,fi) = createIndices m
 109     ordonne x = L.reverse $ L.sortOn snd $ zip (map snd $ M.toList fi) (toList x)
 110
 111
 112
 113
 114 metrics_text :: Text
 115 metrics_text = T.intercalate " " metrics_sentences
 116
 117 metrics_sentences' :: [Text]
 118 metrics_sentences' = splitBy (Sentences 0) metrics_text
 119
 120 -- | Sentences
 121 metrics_sentences :: [Text]
 122 metrics_sentences = [ "There is a table with a glass of wine and a spoon."
 123                     , "I can see the glass on the table."
 124                     , "There was only a spoon on that table."
 125                     , "The glass just fall from the table, pouring wine everywhere."
 126                     , "I wish the glass did not contain wine."
 127                     ]
 128
 129 metrics_sentences_Test = metrics_sentences == metrics_sentences'
 130
 131 -- | Terms reordered to visually check occurrences
 132 -- >>>
 133 {- [ [["table"],["glass"],["wine"],["spoon"]]
 134    , [["glass"],["table"]]
 135    , [["spoon"],["table"]]
 136    , [["glass"],["table"],["wine"]]
 137    , [["glass"],["wine"]]
 138    ]
 139 -}
 140
 141 metrics_terms :: IO [[Terms]]
 142 metrics_terms = mapM (terms MonoMulti EN) $ splitBy (Sentences 0) metrics_text
 143
 144 -- | Occurrences
 145 {-
 146 fromList [ (fromList ["table"] ,fromList [(["table"] , 3 )])]
 147          , (fromList ["object"],fromList [(["object"], 3 )])
 148          , (fromList ["glas"]  ,fromList [(["glas"]  , 2 )])
 149          , (fromList ["spoon"] ,fromList [(["spoon"] , 2 )])
 150 -}
 151 metrics_occ = occurrences <$> L.concat <$> metrics_terms
 152
 153 {-
 154 -- fromList [((["glas"],["object"]),6)
 155             ,((["glas"],["spoon"]),4)
 156             ,((["glas"],["table"]),6),((["object"],["spoon"]),6),((["object"],["table"]),9),((["spoon"],["table"]),6)]
 157
 158 -}
 159 metrics_cooc = cooc <$> metrics_terms
 160
 161 metrics_cooc_mat = do
 162   m <- metrics_cooc
 163   let (ti,_) = createIndices m
 164   let mat_cooc = cooc2mat ti m
 165   pure ( ti
 166        , mat_cooc
 167        , incExcSpeGen_proba  mat_cooc
 168        , incExcSpeGen        mat_cooc
 169        )
 170
 171 metrics_incExcSpeGen = incExcSpeGen_sorted <$> metrics_cooc
 172