src/Gargantext/Core/Text/List.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams.Lists
   3 Description : Tools to build lists
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 {-# LANGUAGE TemplateHaskell   #-}
  13
  14 module Gargantext.Core.Text.List
  15   where
  16
  17 import Control.Lens (makeLenses)
  18 import Data.Maybe (fromMaybe, catMaybes)
  19 import Data.Ord (Down(..))
  20 import Data.Map (Map)
  21 import Data.Set (Set)
  22 import Data.Text (Text)
  23 import qualified Data.Char as Char
  24 import qualified Data.List as List
  25 import qualified Data.Map  as Map
  26 import qualified Data.Set  as Set
  27 import qualified Data.Text as Text
  28
  29 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
  30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
  31 import Gargantext.Core (Lang(..))
  32 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
  33 import Gargantext.Database.Admin.Types.Node (NodeId)
  34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
  35 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
  36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
  37 import Gargantext.Database.Query.Table.Node (defaultList)
  38 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
  39 import Gargantext.Database.Prelude (Cmd)
  40 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
  41
  42 import Gargantext.Prelude
  43 import Gargantext.Core.Text (size)
  44 import Gargantext.Core.Text.List.Learn (Model(..))
  45 -- import Gargantext.Core.Text.Metrics (takeScored)
  46
  47
  48 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
  49                                       , stemX    :: !Int
  50                                       , stopSize :: !Int
  51                                       }
  52                        | BuilderStep1 { withModel :: !Model }
  53                        | BuilderStepN { withModel :: !Model }
  54                        | Tficf { nlb_lang           :: !Lang
  55                                , nlb_group1         :: !Int
  56                                , nlb_group2         :: !Int
  57                                , nlb_stopSize       :: !StopSize
  58                                , nlb_userCorpusId   :: !UserCorpusId
  59                                , nlb_masterCorpusId :: !MasterCorpusId
  60                                }
  61
  62
  63 data StopSize = StopSize {unStopSize :: !Int}
  64
  65 -- | TODO improve grouping functions of Authors, Sources, Institutes..
  66 buildNgramsLists :: HasNodeError err
  67                  => Lang
  68                  -> Int
  69                  -> Int
  70                  -> StopSize
  71                  -> UserCorpusId
  72                  -> MasterCorpusId
  73                  -> Cmd err (Map NgramsType [NgramsElement])
  74 buildNgramsLists l n m s uCid mCid = do
  75   ngTerms     <- buildNgramsTermsList l n m s uCid mCid
  76   othersTerms <- mapM (buildNgramsOthersList uCid identity)
  77                       [Authors, Sources, Institutes]
  78   pure $ Map.unions $ othersTerms <> [ngTerms]
  79
  80
  81 buildNgramsOthersList :: UserCorpusId
  82                       -> (Text -> Text)
  83                       -> NgramsType
  84                       -> Cmd err (Map NgramsType [NgramsElement])
  85 buildNgramsOthersList uCid groupIt nt = do
  86   ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
  87
  88   let
  89     listSize = 9
  90     all'     = List.reverse
  91              $ List.sortOn (Set.size . snd . snd)
  92              $ Map.toList ngs
  93
  94     graphTerms = List.take listSize all'
  95     candiTerms = List.drop listSize all'
  96
  97   pure $ Map.unionsWith (<>) [ toElements MapTerm     graphTerms
  98                              , toElements CandidateTerm candiTerms
  99                              ]
 100     where
 101       toElements nType x =
 102         Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
 103                            | (t, _ns) <- x
 104                            ]
 105                      )]
 106
 107 -- TODO use ListIds
 108 buildNgramsTermsList :: HasNodeError err
 109                      => Lang
 110                      -> Int
 111                      -> Int
 112                      -> StopSize
 113                      -> UserCorpusId
 114                      -> MasterCorpusId
 115                      -> Cmd err (Map NgramsType [NgramsElement])
 116 buildNgramsTermsList l n m s uCid mCid = do
 117
 118 -- Computing global speGen score
 119   allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
 120
 121   -- printDebug "head candidates" (List.take 10 $ allTerms)
 122   -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
 123
 124   -- First remove stops terms
 125   let
 126     -- stopTerms ignored for now (need to be tagged already)
 127     (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
 128
 129   -- Grouping the ngrams and keeping the maximum score for label
 130   let grouped = groupStems'
 131         $ map (\(t,d) -> let stem = ngramsGroup l n m t
 132                           in ( stem
 133                              , GroupedText Nothing t d Set.empty (size t) stem Set.empty
 134                              )
 135               ) candidateTerms
 136
 137       (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
 138
 139 -- printDebug "groupedMult" groupedMult
 140   -- splitting monterms and multiterms to take proportional candidates
 141   let
 142     listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
 143     monoSize = 0.4  :: Double
 144     multSize = 1 - monoSize
 145
 146     splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
 147
 148     (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
 149     (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
 150
 151   printDebug "groupedMonoHead" (List.length groupedMonoHead)
 152   printDebug "groupedMonoTail" (List.length groupedMonoHead)
 153   printDebug "groupedMultHead" (List.length groupedMultHead)
 154   printDebug "groupedMultTail" (List.length groupedMultTail)
 155
 156   let
 157     -- Get Local Scores now for selected grouped ngrams
 158     selectedTerms = Set.toList $ List.foldl'
 159                       (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
 160                                                             $ Set.union g
 161                                                             $ Set.singleton l'
 162                       )
 163                       Set.empty
 164                       (groupedMonoHead <> groupedMultHead)
 165
 166   -- TO remove (and remove HasNodeError instance)
 167   userListId    <- defaultList uCid
 168   masterListId  <- defaultList mCid
 169
 170   mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
 171   let
 172     mapGroups   = Map.fromList
 173                 $ map (\g -> (_gt_stem g, g))
 174                 $ groupedMonoHead <> groupedMultHead
 175
 176     -- grouping with Set NodeId
 177     contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
 178                                     in case Map.lookup k' mapGroups'  of
 179                                       Nothing -> mapGroups'
 180                                       Just g  -> case Map.lookup k mapTextDocIds of
 181                                                    Nothing -> mapGroups'
 182                                                    Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
 183                            )
 184                   mapGroups
 185                   $ Map.keys mapTextDocIds
 186
 187     -- compute cooccurrences
 188     mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
 189                            | (t1, s1) <- mapStemNodeIds
 190                            , (t2, s2) <- mapStemNodeIds
 191                            ]
 192       where
 193         mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
 194   -- printDebug "mapCooc" mapCooc
 195
 196   let
 197     -- computing scores
 198     mapScores f = Map.fromList
 199                 $ map (\(Scored t g s') -> (t, f (g,s')))
 200                 $ normalizeGlobal
 201                 $ map normalizeLocal
 202                 $ scored' mapCooc
 203
 204     groupsWithScores = catMaybes
 205                      $ map (\(stem, g)
 206                              -> case Map.lookup stem mapScores' of
 207                                  Nothing -> Nothing
 208                                  Just s'  -> Just $ g { _gt_score = s'}
 209                             ) $ Map.toList contextsAdded
 210       where
 211         mapScores' = mapScores identity
 212         -- adapt2 TOCHECK with DC
 213   -- printDebug "groupsWithScores" groupsWithScores
 214   let
 215     -- sort / partition / split
 216       -- filter mono/multi again
 217     (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
 218       -- filter with max score
 219     partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
 220
 221     (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
 222     (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
 223
 224       -- splitAt
 225   let
 226     listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
 227     inclSize = 0.4  :: Double
 228     exclSize = 1 - inclSize
 229     splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
 230
 231     (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
 232     (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
 233
 234     (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
 235     (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
 236
 237
 238     -- Final Step building the Typed list
 239     -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
 240     termListHead =
 241              (map (\g -> g { _gt_listType = Just MapTerm} )  (  monoScoredInclHead
 242                                                  <> monoScoredExclHead
 243                                                  <> multScoredInclHead
 244                                                  <> multScoredExclHead
 245                                                  )
 246                )
 247             <> (map (\g -> g { _gt_listType = Just CandidateTerm }) (  monoScoredInclTail
 248                                                       <> monoScoredExclTail
 249                                                       <> multScoredInclTail
 250                                                       <> multScoredExclTail
 251                                                       )
 252                )
 253
 254     termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
 255
 256 --  printDebug "monoScoredInclHead" monoScoredInclHead
 257 --  printDebug "monoScoredExclHead" monoScoredExclTail
 258 --
 259   printDebug "multScoredInclHead" multScoredInclHead
 260   printDebug "multScoredExclTail" multScoredExclTail
 261
 262
 263
 264   pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
 265                                   <> (List.concat $ map toNgramsElement $ termListTail)
 266                        )
 267                       ]
 268
 269 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
 270 groupStems = Map.elems . groupStems'
 271
 272 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
 273 groupStems' = Map.fromListWith grouping
 274   where
 275     grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
 276              (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
 277              | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
 278              | otherwise        = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
 279         where
 280           lt = lt1 <> lt2
 281           gr    = Set.union group1 group2
 282           nodes = Set.union nodes1 nodes2
 283
 284
 285
 286
 287 toNgramsElement :: GroupedText a -> [NgramsElement]
 288 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
 289   [parentElem] <> childrenElems
 290     where
 291       parent = label
 292       children = Set.toList setNgrams
 293       parentElem    = mkNgramsElement (NgramsTerm parent)
 294                                       (fromMaybe CandidateTerm listType)
 295                                       Nothing
 296                                       (mSetFromList (NgramsTerm <$> children))
 297       childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
 298                                                  (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
 299                                                  (mSetFromList [])
 300                           ) (NgramsTerm <$> children)
 301
 302
 303 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
 304 toGargList l n = (l,n)
 305
 306
 307 isStopTerm :: StopSize -> Text -> Bool
 308 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
 309   where
 310     isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
 311
 312
 313 ------------------------------------------------------------------------------
 314 type Group = Lang -> Int -> Int -> Text -> Text
 315 type Stem  = Text
 316 type Label = Text
 317 data GroupedText score =
 318   GroupedText { _gt_listType :: !(Maybe ListType)
 319               , _gt_label    :: !Label
 320               , _gt_score    :: !score
 321               , _gt_group    :: !(Set Text)
 322               , _gt_size     :: !Int
 323               , _gt_stem     :: !Stem
 324               , _gt_nodes    :: !(Set NodeId)
 325               }
 326 instance Show score => Show (GroupedText score) where
 327   show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
 328
 329 instance (Eq a) => Eq (GroupedText a) where
 330   (==) (GroupedText _ _ score1 _ _ _ _)
 331        (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
 332
 333 instance (Eq a, Ord a) => Ord (GroupedText a) where
 334   compare (GroupedText _ _ score1 _ _ _ _)
 335           (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
 336
 337
 338
 339 -- Lenses Instances
 340 makeLenses 'GroupedText