{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-- TODO-ACCESS:
--   check userId CanFillUserCorpus userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}

{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE ScopedTypeVariables     #-}
{-# LANGUAGE TemplateHaskell         #-}

module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataText(..)
  , getDataText
  , flowDataText
  , flow

  , flowCorpusFile
  , flowCorpus
  , flowAnnuaire
  , insertMasterDocs
  , saveDocNgramsWith

  , getOrMkRoot
  , getOrMk_RootWithCorpus
  , TermType(..)
  , DataOrigin(..)
  , allDataOrigins

  , do_api
  , indexAllDocumentsWithPosTag
  )
    where

import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.Hashable (Hashable)
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
import qualified Data.Text as T
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import System.FilePath (FilePath)
import qualified Data.HashMap.Strict as HashMap
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.List.Social (FlowSocialListWith)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Types (POS(NP))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.ContextNodeNgrams2
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)

------------------------------------------------------------------------
-- Imports for upgrade function
import Gargantext.Database.Query.Tree.Root (getRootId)
import Gargantext.Database.Query.Tree (findNodesId)
import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal origin with the API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
                -- TODO Web
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs
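
-- Note: 'allDataOrigins' pairs every external API with both an 'InternalOrigin'
-- and an 'ExternalOrigin', so the same list of sources can be offered for
-- searches over already-ingested data and for new external queries.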
---------------
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]

-- TODO use the split parameter in the config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
                                         <$> splitEvery 500
                                         <$> API.get api (_tt_lang la) q li

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids

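-- Note: for an 'ExternalOrigin', the query is sent to the external API and the
-- results come back as batches of 500 new documents ('DataNew'); for an
-- 'InternalOrigin', the (stemmed) query is searched in the master corpus and
-- only the matching node ids are returned ('DataOld').
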
-------------------------------------------------------------------------------
flowDataText :: ( FlowCmdM env err m
                )
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> Maybe FlowSocialListWith
             -> (JobLog -> m ())
             -> m CorpusId
flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus

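-- A minimal usage sketch (hypothetical values; 'API.PubMed' stands for
-- whichever 'API.ExternalAPIs' constructor is configured, and 'query' is the
-- search text):
--
--   txt <- getDataText (ExternalOrigin API.PubMed) (Multi EN) query (Just 100)
--   _   <- flowDataText user txt (Multi EN) corpusId Nothing logStatus
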
------------------------------------------------------------------------
-- TODO use proxy
flowAnnuaire :: (FlowCmdM env err m)
             => User
             -> Either CorpusName [CorpusId]
             -> (TermType Lang)
             -> FilePath
             -> (JobLog -> m ())
             -> m AnnuaireId
flowAnnuaire u n l filePath logStatus = do
  docs <- liftBase $ ((splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus

------------------------------------------------------------------------
flowCorpusFile :: (FlowCmdM env err m)
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purposes)
               -> TermType Lang -> FileFormat -> FilePath
               -> Maybe FlowSocialListWith
               -> (JobLog -> m ())
               -> m CorpusId
flowCorpusFile u n l la ff fp mfslw logStatus = do
  eParsed <- liftBase $ parseFile ff fp
  case eParsed of
    Right parsed -> do
      let docs = splitEvery 500 $ take l parsed
      flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
    Left e       -> panic $ "Error: " <> T.pack e

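-- A minimal usage sketch (hypothetical values; 'csvFormat' is a placeholder
-- for whichever 'FileFormat' constructor matches the input file):
--
--   cid <- flowCorpusFile (UserName "alice") (Left "My corpus") 10000
--                         (Multi EN) csvFormat "/path/to/docs.csv"
--                         Nothing logStatus
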
------------------------------------------------------------------------
-- | TODO improve the type needed to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> Maybe FlowSocialListWith
           -> [[a]]
           -> (JobLog -> m ())
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)


flow :: ( FlowCmdM env err m
        , FlowCorpus a
        , MkCorpus c
        )
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> Maybe FlowSocialListWith
     -> [[a]]
     -> (JobLog -> m ())
     -> m CorpusId
flow c u cn la mfslw docs logStatus = do
  -- TODO if public then insertMasterDocs else insertUserDocs
  ids <- traverse (\(idx, doc) -> do
                      id <- insertMasterDocs c la doc
                      logStatus JobLog { _scst_succeeded = Just $ 1 + idx
                                       , _scst_failed    = Just 0
                                       , _scst_remaining = Just $ length docs - idx
                                       , _scst_events    = Just []
                                       }
                      pure id
                  ) (zip [1..] docs)
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw

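-- Note: 'flow' inserts each batch of documents into the master corpus with
-- 'insertMasterDocs', reporting progress through 'logStatus' after every
-- batch, then passes all collected document ids to 'flowCorpusUser' for the
-- user-side part of the flow.
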
------------------------------------------------------------------------
flowCorpusUser :: ( FlowCmdM env err m
                  , MkCorpus c
                  )
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> Maybe FlowSocialListWith
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids mfslw = do
  -- User Flow
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  -- NodeTexts is first
  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "NodeTexts: " tId

  -- NodeList is second
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if already present and, if so, ignore
  _ <- Doc.add userCorpusId ids

  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId)
    <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype

  --let gp = (GroupParams l 2 3 (StopSize 3))
  -- Here the PosTagAlgo should be chosen according to the Lang
  let gp = GroupWithPosTag l CoreNLP HashMap.empty
  ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp

  -- printDebug "flowCorpusUser:ngs" ngs

  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId
  -- User Graph Flow
  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph userCorpusId userId
  --_ <- mkPhylo userCorpusId userId
  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  pure userCorpusId

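-- Note: besides the user corpus node itself, 'flowCorpusUser' creates a
-- NodeTexts node, the corpus term list (filled from 'buildNgramsLists',
-- grouping with CoreNLP POS tags), a NodeDashboard and a NodeGraph, and links
-- the given documents to the corpus.
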
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs)
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
    <- mapNodeIdNgrams
       <$> documentIdWithNgrams
           (extractNgramsT $ withLang lang documentsWithId)
           documentsWithId

  lId <- getOrMkList masterCorpusId masterUserId
  _ <- saveDocNgramsWith lId mapNgramsDocs'

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  pure ids'

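-- Note: 'insertMasterDocs' inserts the documents under the master corpus,
-- extracts their ngrams (see the 'ExtractNgramsT' instances below) and stores
-- the ngrams/document associations with 'saveDocNgramsWith'; the returned ids
-- are those of the freshly inserted documents.
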
saveDocNgramsWith :: (FlowCmdM env err m)
                  => ListId
                  -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
                  -> m ()
saveDocNgramsWith lId mapNgramsDocs' = do
  terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'

  -- new
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ HashMap.toList mapNgramsDocs

  -- printDebug "saveDocNgramsWith" mapCgramsId
  -- insertDocNgrams
  _return <- insertContextNodeNgrams2
           $ catMaybes [ ContextNodeNgrams2 <$> Just nId
                                            <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                            <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]

  -- to be removed
  _ <- insertDocNgrams lId $ HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs

  pure ()

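-- Note: each 'ContextNodeNgrams2' row built above links one context (document)
-- node to one ngrams term of a given 'NgramsType', with its occurrence count
-- stored as a Double weight.
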
------------------------------------------------------------------------
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([ContextId], [Indexed ContextId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  -- printDebug "newIds" newIds
  let
    newIds' = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)

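-- Note: 'insertDocs' hashes each document ('addUniqId'), inserts the batch,
-- links the new contexts to the corpus, and returns both the new context ids
-- and the documents re-indexed by those ids; only documents that were actually
-- inserted are kept (see 'toInserted' / 'mergeData' below).
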
------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.toInsert"

toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map    (\r -> (reUniqId r, r))
               . filter (\r -> reInserted r == True)

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [Indexed NodeId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      Indexed <$> fmap reId (lookup sha rs)
              <*> Just hpd

------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => (a
                         -> Cmd err (HashMap b (Map NgramsType Int)))
                     -> [Indexed NodeId a]
                     -> Cmd err [DocumentIdWithNgrams a b]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ _unIndex d
      pure $ DocumentIdWithNgrams d e

-- | TODO check optimization
mapNodeIdNgrams :: (Ord b, Hashable b)
                => [DocumentIdWithNgrams a b]
                -> HashMap b
                           (Map NgramsType
                                (Map NodeId Int)
                           )
mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
  where
    f :: DocumentIdWithNgrams a b
      -> HashMap b (Map NgramsType (Map NodeId Int))
    f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
      where
        nId = _index $ documentWithId d

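-- A schematic illustration (hypothetical values): if node 1 and node 2 each
-- contain the same term once as NgramsTerms, the per-document maps are merged
-- with (+) per (term, type, node) key, yielding
--   term -> fromList [(NgramsTerms, fromList [(1, 1), (2, 1)])]
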
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]

instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
    extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . T.splitOn ", ")
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (T.splitOn ", ")
                      $ _hd_authors doc

          terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
                 <$> concat
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ HashMap.fromList
               $  [(SimpleNgrams source,   Map.singleton Sources 1)                        ]
               <> [(SimpleNgrams i',       Map.singleton Institutes 1)  | i' <- institutes ]
               <> [(SimpleNgrams a',       Map.singleton Authors 1)     | a' <- authors    ]
               <> [(EnrichedNgrams t',     Map.singleton NgramsTerms 1) | t' <- terms'     ]

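-- Note: for a document, the source, institutes and authors are indexed as
-- 'SimpleNgrams', while terms extracted from the document text are POS-tagged
-- with CoreNLP and indexed as 'EnrichedNgrams'; missing metadata fields fall
-- back to the literal "Nothing" string.
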
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h

instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ h) = hasText h


-- | TODO put elsewhere
-- | Upgrade function
-- Assumes all documents are in English (which is currently the case)
indexAllDocumentsWithPosTag :: FlowCmdM env err m
                            => m ()
indexAllDocumentsWithPosTag = do
  rootId    <- getRootId (UserName userMaster)
  corpusIds <- findNodesId rootId [NodeCorpus]
  docs      <- List.concat <$> mapM getDocumentsWithParentId corpusIds
  _ <- mapM extractInsert (splitEvery 1000 docs)
  pure ()

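-- Note: the upgrade walks every corpus under the master user's root, collects
-- the documents and re-indexes them in batches of 1000 through 'extractInsert',
-- which runs the extraction pipeline with a Multi EN language setting.
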
extractInsert :: FlowCmdM env err m
              => [Node HyperdataDocument] -> m ()
extractInsert docs = do
  let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
  mapNgramsDocs' <- mapNodeIdNgrams
                <$> documentIdWithNgrams
                    (extractNgramsT $ withLang (Multi EN) documentsWithId)
                    documentsWithId
  _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
  pure ()