src/Gargantext/Core/Text/List.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams.Lists
   3 Description : Tools to build lists
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 {-# LANGUAGE TemplateHaskell   #-}
  13
  14 module Gargantext.Core.Text.List
  15   where
  16
  17 import Control.Lens (makeLenses)
  18 import Data.Maybe (fromMaybe, catMaybes)
  19 import Data.Ord (Down(..))
  20 import Data.Map (Map)
  21 import Data.Set (Set)
  22 import Data.Text (Text)
  23 import qualified Data.Char as Char
  24 import qualified Data.List as List
  25 import qualified Data.Map  as Map
  26 import qualified Data.Set  as Set
  27 import qualified Data.Text as Text
  28
  29 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
  30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
  31 import Gargantext.Core (Lang(..))
  32 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
  33 import Gargantext.Database.Admin.Types.Node (NodeId)
  34 import Gargantext.Core.Text.Metrics (scored', Scored(..))
  35 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
  36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
  37 import Gargantext.Database.Query.Table.Node (defaultList)
  38 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
  39 import Gargantext.Database.Prelude (Cmd)
  40 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
  41
  42 import Gargantext.Prelude
  43 import Gargantext.Core.Text (size)
  44 import Gargantext.Core.Text.List.Learn (Model(..))
  45 -- import Gargantext.Core.Text.Metrics (takeScored)
  46
  47
  48 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
  49                                       , stemX    :: !Int
  50                                       , stopSize :: !Int
  51                                       }
  52                        | BuilderStep1 { withModel :: !Model }
  53                        | BuilderStepN { withModel :: !Model }
  54                        | Tficf { nlb_lang           :: !Lang
  55                                , nlb_group1         :: !Int
  56                                , nlb_group2         :: !Int
  57                                , nlb_stopSize       :: !StopSize
  58                                , nlb_userCorpusId   :: !UserCorpusId
  59                                , nlb_masterCorpusId :: !MasterCorpusId
  60                                }
  61
  62
  63 data StopSize = StopSize {unStopSize :: !Int}
  64
  65 -- | TODO improve grouping functions of Authors, Sources, Institutes..
  66 buildNgramsLists :: HasNodeError err
  67                  => Lang
  68                  -> Int
  69                  -> Int
  70                  -> StopSize
  71                  -> UserCorpusId
  72                  -> MasterCorpusId
  73                  -> Cmd err (Map NgramsType [NgramsElement])
  74 buildNgramsLists l n m s uCid mCid = do
  75   ngTerms     <- buildNgramsTermsList l n m s uCid mCid
  76   othersTerms <- mapM (buildNgramsOthersList uCid identity)
  77                       [Authors, Sources, Institutes]
  78   pure $ Map.unions $ othersTerms <> [ngTerms]
  79
  80
  81 buildNgramsOthersList :: UserCorpusId
  82                       -> (Text -> Text)
  83                       -> NgramsType
  84                       -> Cmd err (Map NgramsType [NgramsElement])
  85 buildNgramsOthersList uCid groupIt nt = do
  86   ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
  87
  88   let
  89     listSize = 9
  90     all'     = List.reverse
  91              $ List.sortOn (Set.size . snd . snd)
  92              $ Map.toList ngs
  93
  94     graphTerms = List.take listSize all'
  95     candiTerms = List.drop listSize all'
  96
  97   pure $ Map.unionsWith (<>) [ toElements MapTerm     graphTerms
  98                              , toElements CandidateTerm candiTerms
  99                              ]
 100     where
 101       toElements nType x =
 102         Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
 103                            | (t, _ns) <- x
 104                            ]
 105                      )]
 106
 107 -- TODO use ListIds
 108 buildNgramsTermsList :: HasNodeError err
 109                      => Lang
 110                      -> Int
 111                      -> Int
 112                      -> StopSize
 113                      -> UserCorpusId
 114                      -> MasterCorpusId
 115                      -> Cmd err (Map NgramsType [NgramsElement])
 116 buildNgramsTermsList l n m s uCid mCid = do
 117
 118 -- Computing global speGen score
 119   allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
 120
 121   -- printDebug "head candidates" (List.take 10 $ allTerms)
 122   -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
 123
 124   -- First remove stops terms
 125   let
 126     -- stopTerms ignored for now (need to be tagged already)
 127     (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
 128
 129   -- Grouping the ngrams and keeping the maximum score for label
 130   let grouped = groupStems'
 131         $ map (\(t,d) -> let stem = ngramsGroup l n m t
 132                           in ( stem
 133                              , GroupedText Nothing t d Set.empty (size t) stem Set.empty
 134                              )
 135               ) candidateTerms
 136
 137       (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
 138
 139 -- printDebug "groupedMult" groupedMult
 140   -- splitting monterms and multiterms to take proportional candidates
 141   let
 142     listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
 143     monoSize = 0.4  :: Double
 144     multSize = 1 - monoSize
 145
 146     splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
 147
 148     (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
 149     (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
 150
 151   printDebug "groupedMonoHead" (List.length groupedMonoHead)
 152   printDebug "groupedMonoTail" (List.length groupedMonoHead)
 153   printDebug "groupedMultHead" (List.length groupedMultHead)
 154   printDebug "groupedMultTail" (List.length groupedMultTail)
 155
 156   let
 157     -- Get Local Scores now for selected grouped ngrams
 158     selectedTerms = Set.toList $ List.foldl'
 159                       (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
 160                                                             $ Set.union g
 161                                                             $ Set.singleton l'
 162                       )
 163                       Set.empty
 164                       (groupedMonoHead <> groupedMultHead)
 165
 166   -- TO remove (and remove HasNodeError instance)
 167   userListId    <- defaultList uCid
 168   masterListId  <- defaultList mCid
 169
 170   mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
 171   let
 172     mapGroups   = Map.fromList
 173                 $ map (\g -> (_gt_stem g, g))
 174                 $ groupedMonoHead <> groupedMultHead
 175
 176     -- grouping with Set NodeId
 177     contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
 178                                     in case Map.lookup k' mapGroups'  of
 179                                       Nothing -> mapGroups'
 180                                       Just g  -> case Map.lookup k mapTextDocIds of
 181                                                    Nothing -> mapGroups'
 182                                                    Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
 183                            )
 184                   mapGroups
 185                   $ Map.keys mapTextDocIds
 186
 187     -- compute cooccurrences
 188     mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
 189                            | (t1, s1) <- mapStemNodeIds
 190                            , (t2, s2) <- mapStemNodeIds
 191                            ]
 192       where
 193         mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
 194   -- printDebug "mapCooc" mapCooc
 195
 196   let
 197     -- computing scores
 198     mapScores f = Map.fromList $ map (\(Scored t g s') -> (t, f (g,s'))) $ scored' mapCooc
 199
 200     groupsWithScores = catMaybes
 201                      $ map (\(stem, g)
 202                              -> case Map.lookup stem mapScores' of
 203                                  Nothing -> Nothing
 204                                  Just s'  -> Just $ g { _gt_score = s'}
 205                             ) $ Map.toList contextsAdded
 206       where
 207         mapScores' = mapScores adapt1 -- identity
 208         adapt1 (s1,s2) = (log' 5 s1, log' 2 s2)
 209         log' n' x     = 1 + (if x <= 0 then 0 else log $ (10^(n'::Int)) * x)
 210         -- adapt2 TOCHECK with DC
 211   -- printDebug "groupsWithScores" groupsWithScores
 212   let
 213     -- sort / partition / split
 214       -- filter mono/multi again
 215     (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
 216       -- filter with max score
 217     partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
 218
 219     (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
 220     (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
 221
 222       -- splitAt
 223   let
 224     listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
 225     inclSize = 0.4  :: Double
 226     exclSize = 1 - inclSize
 227     splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
 228
 229     (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
 230     (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
 231
 232     (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
 233     (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
 234
 235
 236     -- Final Step building the Typed list
 237     -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
 238     termListHead =
 239              (map (\g -> g { _gt_listType = Just MapTerm} )  (  monoScoredInclHead
 240                                                  <> monoScoredExclHead
 241                                                  <> multScoredInclHead
 242                                                  <> multScoredExclHead
 243                                                  )
 244                )
 245             <> (map (\g -> g { _gt_listType = Just CandidateTerm }) (  monoScoredInclTail
 246                                                       <> monoScoredExclTail
 247                                                       <> multScoredInclTail
 248                                                       <> multScoredExclTail
 249                                                       )
 250                )
 251
 252     termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
 253
 254 --  printDebug "monoScoredInclHead" monoScoredInclHead
 255 --  printDebug "monoScoredExclHead" monoScoredExclTail
 256 --
 257   printDebug "multScoredInclHead" multScoredInclHead
 258   printDebug "multScoredExclTail" multScoredExclTail
 259
 260
 261
 262   pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
 263                                   <> (List.concat $ map toNgramsElement $ termListTail)
 264                        )
 265                       ]
 266
 267 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
 268 groupStems = Map.elems . groupStems'
 269
 270 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
 271 groupStems' = Map.fromListWith grouping
 272   where
 273     grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
 274              (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
 275              | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
 276              | otherwise        = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
 277         where
 278           lt = lt1 <> lt2
 279           gr    = Set.union group1 group2
 280           nodes = Set.union nodes1 nodes2
 281
 282
 283
 284
 285 toNgramsElement :: GroupedText a -> [NgramsElement]
 286 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
 287   [parentElem] <> childrenElems
 288     where
 289       parent = label
 290       children = Set.toList setNgrams
 291       parentElem    = mkNgramsElement (NgramsTerm parent)
 292                                       (fromMaybe CandidateTerm listType)
 293                                       Nothing
 294                                       (mSetFromList (NgramsTerm <$> children))
 295       childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
 296                                                  (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
 297                                                  (mSetFromList [])
 298                           ) (NgramsTerm <$> children)
 299
 300
 301 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
 302 toGargList l n = (l,n)
 303
 304
 305 isStopTerm :: StopSize -> Text -> Bool
 306 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
 307   where
 308     isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
 309
 310
 311 ------------------------------------------------------------------------------
 312 type Group = Lang -> Int -> Int -> Text -> Text
 313 type Stem  = Text
 314 type Label = Text
 315 data GroupedText score =
 316   GroupedText { _gt_listType :: !(Maybe ListType)
 317               , _gt_label    :: !Label
 318               , _gt_score    :: !score
 319               , _gt_group    :: !(Set Text)
 320               , _gt_size     :: !Int
 321               , _gt_stem     :: !Stem
 322               , _gt_nodes    :: !(Set NodeId)
 323               }
 324 instance Show score => Show (GroupedText score) where
 325   show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
 326
 327 instance (Eq a) => Eq (GroupedText a) where
 328   (==) (GroupedText _ _ score1 _ _ _ _)
 329        (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
 330
 331 instance (Eq a, Ord a) => Ord (GroupedText a) where
 332   compare (GroupedText _ _ score1 _ _ _ _)
 333           (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
 334
 335
 336
 337 -- Lenses Instances
 338 makeLenses 'GroupedText