src/Gargantext/Core/Text/List.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams.Lists
   3 Description : Tools to build lists
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 {-# LANGUAGE ScopedTypeVariables #-}
  13 {-# LANGUAGE TemplateHaskell     #-}
  14
  15 module Gargantext.Core.Text.List
  16   where
  17
  18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
  19 import Data.HashMap.Strict (HashMap)
  20 import Data.HashSet (HashSet)
  21 import Data.Map (Map)
  22 import Data.Monoid (mempty)
  23 import Data.Ord (Down(..))
  24 import Data.Set (Set)
  25 import Data.Tuple.Extra (both)
  26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
  27 import Gargantext.Core.Text (size)
  28 import Gargantext.Core.Text.List.Group
  29 import Gargantext.Core.Text.List.Group.Prelude
  30 import Gargantext.Core.Text.List.Group.WithStem
  31 import Gargantext.Core.Text.List.Social
  32 import Gargantext.Core.Text.List.Social.Prelude
  33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
  34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
  35 import Gargantext.Core.Types.Individu (User(..))
  36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
  37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
  38 import Gargantext.Database.Admin.Types.Node (NodeId)
  39 import Gargantext.Database.Prelude (CmdM)
  40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
  41 import Gargantext.Database.Query.Table.Node (defaultList)
  42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
  43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
  44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
  45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
  46 import Gargantext.Prelude
  47 import qualified Data.HashMap.Strict as HashMap
  48 import qualified Data.List    as List
  49 import qualified Data.Map     as Map
  50 import qualified Data.Set     as Set
  51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
  52 import qualified Data.HashSet as HashSet
  53
  54 {-
  55 -- TODO maybe useful for later
  56 isStopTerm :: StopSize -> Text -> Bool
  57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  58   where
  59     isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
  60 -}
  61
  62
  63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
  64 buildNgramsLists :: ( RepoCmdM env err m
  65                     , CmdM     env err m
  66                     , HasTreeError err
  67                     , HasNodeError err
  68                     )
  69                  => GroupParams
  70                  -> User
  71                  -> UserCorpusId
  72                  -> MasterCorpusId
  73                  -> m (Map NgramsType [NgramsElement])
  74 buildNgramsLists gp user uCid mCid = do
  75   ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  76   othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
  77                       [ (Authors   , MapListSize 9)
  78                       , (Sources   , MapListSize 9)
  79                       , (Institutes, MapListSize 9)
  80                       ]
  81
  82   pure $ Map.unions $ [ngTerms] <> othersTerms
  83
  84
  85 data MapListSize = MapListSize { unMapListSize :: !Int }
  86
  87 buildNgramsOthersList ::( HasNodeError err
  88                         , CmdM     env err m
  89                         , RepoCmdM env err m
  90                         , HasTreeError err
  91                         )
  92                         => User
  93                         -> UserCorpusId
  94                         -> GroupParams
  95                         -> (NgramsType, MapListSize)
  96                         -> m (Map NgramsType [NgramsElement])
  97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  98   allTerms  :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
  99
 100   -- PrivateFirst for first developments since Public NodeMode is not implemented yet
 101   socialLists :: FlowCont NgramsTerm FlowListScores
 102     <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
 103                                                       $ HashMap.fromList
 104                                                       $ List.zip (HashMap.keys allTerms)
 105                                                                  (List.cycle [mempty])
 106                                            )
 107   let
 108     groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
 109
 110     (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
 111                            $ view flc_scores groupedWithList
 112
 113     (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm)  . viewListType) tailTerms
 114
 115     listSize = mapListSize - (List.length mapTerms)
 116     (mapTerms', candiTerms) = both HashMap.fromList
 117                             $ List.splitAt listSize
 118                             $ List.sortOn (Down . viewScore . snd)
 119                             $ HashMap.toList tailTerms'
 120
 121   pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
 122                           <> (toNgramsElement mapTerms )
 123                           <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
 124                           <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
 125                           )]
 126
 127
 128 getGroupParams :: ( HasNodeError err
 129                   , CmdM     env err m
 130                   , RepoCmdM env err m
 131                   , HasTreeError err
 132                   )
 133                => GroupParams -> HashSet Ngrams -> m GroupParams
 134 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
 135   hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
 136   -- printDebug "hashMap" hashMap
 137   pure $ over gwl_map (\x -> x <> hashMap) gp
 138 getGroupParams gp _ = pure gp
 139
 140
 141 -- TODO use ListIds
 142 buildNgramsTermsList :: ( HasNodeError err
 143                         , CmdM     env err m
 144                         , RepoCmdM env err m
 145                         , HasTreeError err
 146                         )
 147                         => User
 148                         -> UserCorpusId
 149                         -> MasterCorpusId
 150                         -> GroupParams
 151                         -> (NgramsType, MapListSize)
 152                         -> m (Map NgramsType [NgramsElement])
 153 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
 154
 155 -- Filter 0 With Double
 156 -- Computing global speGen score
 157   allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
 158
 159   -- PrivateFirst for first developments since Public NodeMode is not implemented yet
 160   socialLists :: FlowCont NgramsTerm FlowListScores
 161     <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
 162                                                       $ HashMap.fromList
 163                                                       $ List.zip (HashMap.keys   allTerms)
 164                                                                  (List.cycle     [mempty])
 165                                            )
 166   let ngramsKeys = HashMap.keysSet allTerms
 167
 168   groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
 169
 170   let
 171     socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
 172   --printDebug "socialLists_Stemmed" socialLists_Stemmed
 173     groupedWithList = toGroupedTree socialLists_Stemmed allTerms
 174     (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
 175                                 $ HashMap.filter (\g -> (view gts'_score g) > 1)
 176                                 $ view flc_scores groupedWithList
 177
 178     (groupedMono, groupedMult)  = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
 179
 180   -- printDebug "stopTerms" stopTerms
 181
 182   -- splitting monterms and multiterms to take proportional candidates
 183     -- use % of list if to big, or Int if too small
 184     listSizeGlobal = 2000 :: Double
 185     monoSize = 0.4  :: Double
 186     multSize = 1 - monoSize
 187
 188     splitAt n' ns = both (HashMap.fromListWith (<>))
 189                   $ List.splitAt (round $ n' * listSizeGlobal)
 190                   $ List.sortOn (viewScore . snd)
 191                   $ HashMap.toList ns
 192
 193     (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
 194     (groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
 195
 196 -------------------------
 197 -- Filter 1 With Set NodeId and SpeGen
 198     selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
 199
 200
 201  -- TODO remove (and remove HasNodeError instance)
 202   userListId    <- defaultList uCid
 203   masterListId  <- defaultList mCid
 204
 205   mapTextDocIds <- getNodesByNgramsOnlyUser uCid
 206                                             [userListId, masterListId]
 207                                             nt
 208                                             selectedTerms
 209
 210   let
 211     groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
 212     groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
 213                                 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
 214
 215
 216   printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
 217
 218   -- Coocurrences computation
 219   --, t1 >= t2 -- permute byAxis diag  -- since matrix symmetric
 220   let mapCooc = HashMap.filter (>1) -- removing cooc of 1
 221               $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
 222                            | (t1, s1) <- mapStemNodeIds
 223                            , (t2, s2) <- mapStemNodeIds
 224                            ]
 225           where
 226             mapStemNodeIds = HashMap.toList
 227                            $ HashMap.map viewScores
 228                            $ groupedTreeScores_SetNodeId
 229   let
 230     -- computing scores
 231     mapScores f = HashMap.fromList
 232                 $ map (\g -> (view scored_terms g, f g))
 233                 $ normalizeGlobal
 234                 $ map normalizeLocal
 235                 $ scored'
 236                 $ Map.fromList -- TODO remove this
 237                 $ HashMap.toList mapCooc
 238
 239   let
 240     groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
 241     groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
 242
 243   let
 244     -- sort / partition / split
 245     -- filter mono/multi again
 246     (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
 247
 248       -- filter with max score
 249     partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
 250                                                > (view scored_speExc $ view gts'_score g)
 251                                           )
 252
 253     (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
 254     (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
 255
 256   -- splitAt
 257   let
 258     -- use % of list if to big, or Int if to small
 259     mapSize = 1000 :: Double
 260     canSize = mapSize * 10 :: Double
 261
 262     inclSize = 0.4  :: Double
 263     exclSize = 1 - inclSize
 264
 265     splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
 266     sortOn   f  = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
 267
 268     monoInc_size n = splitAt' n $ monoSize * inclSize / 2
 269     multExc_size n = splitAt' n $ multSize * exclSize / 2
 270
 271
 272     (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
 273     (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
 274
 275     (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
 276     (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
 277
 278
 279     (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
 280     (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
 281
 282     (canMulScoredInclHead, _)  = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
 283     (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
 284
 285 ------------------------------------------------------------
 286     -- Final Step building the Typed list
 287     -- Candidates Terms need to be filtered
 288   let
 289     maps = setListType (Just MapTerm)
 290         $  mapMonoScoredInclHead
 291         <> mapMonoScoredExclHead
 292         <> mapMultScoredInclHead
 293         <> mapMultScoredExclHead
 294
 295     -- An original way to filter to start with
 296     cands = setListType (Just CandidateTerm)
 297           $ canMonoScoredIncHead
 298           <> canMonoScoredExclHead
 299           <> canMulScoredInclHead
 300           <> canMultScoredExclHead
 301
 302   -- TODO count it too
 303     cands' = setListType (Just CandidateTerm)
 304           {-\$  groupedMonoTail
 305           <>-} groupedMultTail
 306
 307     result = Map.unionsWith (<>)
 308        [ Map.fromList [( nt, toNgramsElement maps
 309                           <> toNgramsElement cands
 310                           <> toNgramsElement cands'
 311                           <> toNgramsElement stopTerms
 312                       )]
 313        ]
 314
 315   pure result