src/Gargantext/Core/Text/List.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams.Lists
   3 Description : Tools to build lists
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 {-# LANGUAGE ScopedTypeVariables #-}
  13 {-# LANGUAGE TemplateHaskell     #-}
  14
  15 module Gargantext.Core.Text.List
  16   where
  17
  18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
  19 import Data.HashMap.Strict (HashMap)
  20 import Data.HashSet (HashSet)
  21 import Data.Map (Map)
  22 import Data.Monoid (mempty)
  23 import Data.Ord (Down(..))
  24 import Data.Set (Set)
  25 import Data.Tuple.Extra (both)
  26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
  27 import Gargantext.Core.Text (size)
  28 import Gargantext.Core.Text.List.Group
  29 import Gargantext.Core.Text.List.Group.Prelude
  30 import Gargantext.Core.Text.List.Group.WithStem
  31 import Gargantext.Core.Text.List.Social
  32 import Gargantext.Core.Text.List.Social.Prelude
  33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
  34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
  35 import Gargantext.Core.Types.Individu (User(..))
  36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
  37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
  38 import Gargantext.Database.Admin.Types.Node (NodeId)
  39 import Gargantext.Database.Prelude (CmdM)
  40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
  41 import Gargantext.Database.Query.Table.Node (defaultList)
  42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
  43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
  44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
  45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
  46 import Gargantext.Prelude
  47 import qualified Data.HashMap.Strict as HashMap
  48 import qualified Data.List    as List
  49 import qualified Data.Map     as Map
  50 import qualified Data.Set     as Set
  51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
  52 import qualified Data.HashSet as HashSet
  53
  54 {-
  55 -- TODO maybe useful for later
  56 isStopTerm :: StopSize -> Text -> Bool
  57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  58   where
  59     isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
  60 -}
  61
  62
  63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
  64 buildNgramsLists :: ( RepoCmdM env err m
  65                     , CmdM     env err m
  66                     , HasTreeError err
  67                     , HasNodeError err
  68                     )
  69                  => GroupParams
  70                  -> User
  71                  -> UserCorpusId
  72                  -> MasterCorpusId
  73                  -> m (Map NgramsType [NgramsElement])
  74 buildNgramsLists gp user uCid mCid = do
  75   ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  76   othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
  77                       [ (Authors   , MapListSize 9)
  78                       , (Sources   , MapListSize 9)
  79                       , (Institutes, MapListSize 9)
  80                       ]
  81
  82   pure $ Map.unions $ [ngTerms] <> othersTerms
  83
  84
  85 data MapListSize = MapListSize { unMapListSize :: !Int }
  86
  87 buildNgramsOthersList ::( HasNodeError err
  88                         , CmdM     env err m
  89                         , RepoCmdM env err m
  90                         , HasTreeError err
  91                         )
  92                         => User
  93                         -> UserCorpusId
  94                         -> GroupParams
  95                         -> (NgramsType, MapListSize)
  96                         -> m (Map NgramsType [NgramsElement])
  97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  98   allTerms  :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
  99
 100   -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
 101   socialLists :: FlowCont NgramsTerm FlowListScores
 102     <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
 103                                                       $ HashMap.fromList
 104                                                       $ List.zip (HashMap.keys allTerms)
 105                                                                  (List.cycle [mempty])
 106                                            )
 107 {-
 108   if nt == Sources -- Authors
 109      then printDebug "flowSocialList" socialLists
 110      else printDebug "flowSocialList" ""
 111 -}
 112   let
 113     groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
 114 {-
 115   if nt == Sources -- Authors
 116      then printDebug "groupedWithList" groupedWithList
 117      else printDebug "groupedWithList" ""
 118 -}
 119
 120   let
 121     (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
 122                            $ view flc_scores groupedWithList
 123
 124     (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm)  . viewListType) tailTerms
 125
 126     listSize = mapListSize - (List.length mapTerms)
 127     (mapTerms', candiTerms) = both HashMap.fromList
 128                             $ List.splitAt listSize
 129                             $ List.sortOn (Down . viewScore . snd)
 130                             $ HashMap.toList tailTerms'
 131
 132   pure $ Map.fromList [( nt,  (toNgramsElement stopTerms)
 133                              <> (toNgramsElement mapTerms )
 134                              <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
 135                              <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
 136                           )]
 137
 138
 139 getGroupParams :: ( HasNodeError err
 140                   , CmdM     env err m
 141                   , RepoCmdM env err m
 142                   , HasTreeError err
 143                   )
 144                => GroupParams -> HashSet Ngrams -> m GroupParams
 145 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
 146   hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
 147   -- printDebug "hashMap" hashMap
 148   pure $ over gwl_map (\x -> x <> hashMap) gp
 149 getGroupParams gp _ = pure gp
 150
 151
 152
 153
 154 -- TODO use ListIds
 155 buildNgramsTermsList :: ( HasNodeError err
 156                         , CmdM     env err m
 157                         , RepoCmdM env err m
 158                         , HasTreeError err
 159                         )
 160                         => User
 161                         -> UserCorpusId
 162                         -> MasterCorpusId
 163                         -> GroupParams
 164                         -> (NgramsType, MapListSize)
 165                         -> m (Map NgramsType [NgramsElement])
 166 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
 167
 168 -- | Filter 0 With Double
 169 -- Computing global speGen score
 170   allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
 171
 172   -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
 173   socialLists :: FlowCont NgramsTerm FlowListScores
 174     <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
 175                                                       $ HashMap.fromList
 176                                                       $ List.zip (HashMap.keys   allTerms)
 177                                                                  (List.cycle     [mempty])
 178                                            )
 179   let ngramsKeys = HashMap.keysSet allTerms
 180
 181   groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
 182   let socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
 183   --printDebug "socialLists_Stemmed" socialLists_Stemmed
 184   let groupedWithList = toGroupedTree socialLists_Stemmed allTerms
 185       (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
 186                                   $ view flc_scores groupedWithList
 187
 188       (groupedMono, groupedMult)  = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
 189
 190   -- printDebug "stopTerms" stopTerms
 191
 192   -- splitting monterms and multiterms to take proportional candidates
 193   let
 194     -- use % of list if to big, or Int if too small
 195     listSizeGlobal = 2000 :: Double
 196     monoSize = 0.4  :: Double
 197     multSize = 1 - monoSize
 198
 199     splitAt n' ns = both (HashMap.fromListWith (<>))
 200                   $ List.splitAt (round $ n' * listSizeGlobal)
 201                   $ List.sortOn (viewScore . snd)
 202                   $ HashMap.toList ns
 203
 204     (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
 205     (groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
 206
 207 -------------------------
 208 -- Filter 1 With Set NodeId and SpeGen
 209     selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
 210
 211
 212  -- TO remove (and remove HasNodeError instance)
 213   userListId    <- defaultList uCid
 214   masterListId  <- defaultList mCid
 215
 216
 217   mapTextDocIds <- getNodesByNgramsOnlyUser uCid
 218                                             [userListId, masterListId]
 219                                             nt
 220                                             selectedTerms
 221
 222   let
 223     groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
 224     groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
 225
 226   -- | Coocurrences computation
 227   --, t1 >= t2 -- permute byAxis diag  -- since matrix symmetric
 228   let mapCooc = HashMap.filter (>2)
 229               $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
 230                            | (t1, s1) <- mapStemNodeIds
 231                            , (t2, s2) <- mapStemNodeIds
 232                            ]
 233           where
 234             mapStemNodeIds = HashMap.toList
 235                            $ HashMap.map viewScores
 236                            $ groupedTreeScores_SetNodeId
 237   let
 238     -- computing scores
 239     mapScores f = HashMap.fromList
 240                 $ map (\g -> (view scored_terms g, f g))
 241                 $ normalizeGlobal
 242                 $ map normalizeLocal
 243                 $ scored'
 244                 $ Map.fromList -- TODO remove this
 245                 $ HashMap.toList mapCooc
 246
 247   let
 248     groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
 249     groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
 250                                                 (  groupedMonoHead
 251                                                 <> groupedMultHead
 252                                                 )
 253
 254   let
 255     -- sort / partition / split
 256     -- filter mono/multi again
 257     (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
 258
 259       -- filter with max score
 260     partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
 261                                                > (view scored_speExc $ view gts'_score g)
 262                                           )
 263
 264     (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
 265     (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
 266
 267   -- splitAt
 268   let
 269     -- use % of list if to big, or Int if to small
 270     listSizeLocal = 1000 :: Double
 271     inclSize = 0.4  :: Double
 272     exclSize = 1 - inclSize
 273
 274     splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
 275     sortOn   f  = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
 276
 277
 278     monoInc_size = splitAt' $ monoSize * inclSize / 2
 279     (monoScoredInclHead, _monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
 280     (monoScoredExclHead, _monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
 281
 282     multExc_size = splitAt' $ multSize * exclSize / 2
 283     (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
 284     (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
 285
 286 ------------------------------------------------------------
 287     -- Final Step building the Typed list
 288     -- Candidates Terms need to be filtered
 289   let
 290     maps = setListType (Just MapTerm)
 291         $  monoScoredInclHead
 292         <> monoScoredExclHead
 293         <> multScoredInclHead
 294         <> multScoredExclHead
 295
 296     -- An original way to filter to start with
 297     cands = setListType (Just CandidateTerm)
 298           $  {- monoScoredInclTail
 299           <> monoScoredExclTail
 300           <> -} multScoredInclTail
 301           <>  multScoredExclTail
 302
 303     cands' = setListType (Just CandidateTerm)
 304           {-$  groupedMonoTail
 305           <>-} groupedMultTail
 306
 307     result = Map.unionsWith (<>)
 308        [ Map.fromList [( nt, toNgramsElement maps
 309                           <> toNgramsElement cands
 310                           <> toNgramsElement cands'
 311                           <> toNgramsElement stopTerms
 312                       )]
 313        ]
 314
 315   -- printDebug "result" result
 316
 317   pure result