src/Gargantext/Database/Action/Flow.hs

   1 {-|
   2 Module      : Gargantext.Database.Flow
   3 Description : Database Flow
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -- TODO-ACCESS:
  11 --   check userId       CanFillUserCorpus   userCorpusId
  12 --   check masterUserId CanFillMasterCorpus masterCorpusId
  13
  14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
  15 -- TODO-EVENTS: InsertedNodes
  16 -}
  17
  18 {-# OPTIONS_GHC -fno-warn-orphans #-}
  19
  20 {-# LANGUAGE ConstrainedClassMethods #-}
  21 {-# LANGUAGE ConstraintKinds         #-}
  22 {-# LANGUAGE InstanceSigs            #-}
  23 {-# LANGUAGE ScopedTypeVariables     #-}
  24 {-# LANGUAGE TemplateHaskell         #-}
  25
  26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  27   ( DataText(..)
  28   , getDataText
  29   , flowDataText
  30   , flow
  31
  32   , flowCorpusFile
  33   , flowCorpus
  34   , flowAnnuaire
  35   , insertMasterDocs
  36   , saveDocNgramsWith
  37
  38   , getOrMkRoot
  39   , getOrMk_RootWithCorpus
  40   , TermType(..)
  41   , DataOrigin(..)
  42   , allDataOrigins
  43
  44   , do_api
  45   , indexAllDocumentsWithPosTag
  46   )
  47     where
  48
  49 import Control.Lens ((^.), view, _Just, makeLenses)
  50 import Data.Aeson.TH (deriveJSON)
  51 import Data.Either
  52 import Data.HashMap.Strict (HashMap)
  53 import Data.Hashable (Hashable)
  54 import Data.List (concat)
  55 import Data.Map (Map, lookup)
  56 import Data.Maybe (catMaybes)
  57 import Data.Monoid
  58 import Data.Swagger
  59 import qualified Data.Text as T
  60 import Data.Traversable (traverse)
  61 import Data.Tuple.Extra (first, second)
  62 import GHC.Generics (Generic)
  63 import System.FilePath (FilePath)
  64 import qualified Data.HashMap.Strict as HashMap
  65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
  66 import qualified Data.Map as Map
  67
  68 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
  69 import Gargantext.Core (Lang(..), PosTagAlgo(..))
  70 import Gargantext.Core.Ext.IMT (toSchoolName)
  71 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
  72 import Gargantext.Core.Flow.Types
  73 import Gargantext.Core.Text
  74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
  75 import Gargantext.Core.Text.List (buildNgramsLists)
  76 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
  77 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
  78 import Gargantext.Core.Text.Terms
  79 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
  80 import Gargantext.Core.Types (POS(NP))
  81 import Gargantext.Core.Types.Individu (User(..))
  82 import Gargantext.Core.Types.Main
  83 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
  84 import Gargantext.Database.Action.Flow.List
  85 import Gargantext.Database.Action.Flow.Types
  86 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
  87 import Gargantext.Database.Action.Search (searchDocInDatabase)
  88 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
  89 import Gargantext.Database.Admin.Types.Hyperdata
  90 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
  91 import Gargantext.Database.Prelude
  92 import Gargantext.Database.Query.Table.Ngrams
  93 import Gargantext.Database.Query.Table.Node
  94 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
  95 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
  96 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
  97 import Gargantext.Database.Query.Table.ContextNodeNgrams2
  98 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
  99 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
 100 import Gargantext.Database.Types
 101 import Gargantext.Prelude
 102 import Gargantext.Prelude.Crypto.Hash (Hash)
 103 import qualified Gargantext.Core.Text.Corpus.API as API
 104 import qualified Gargantext.Database.Query.Table.Node.Document.Add  as Doc  (add)
 105
 106 ------------------------------------------------------------------------
 107 -- Imports for upgrade function
 108 import Gargantext.Database.Query.Tree.Root (getRootId)
 109 import Gargantext.Database.Query.Tree (findNodesId)
 110 import qualified Data.List as List
 111 ------------------------------------------------------------------------
 112 -- TODO use internal with API name (could be old data)
 113 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
 114                 | ExternalOrigin { _do_api :: API.ExternalAPIs }
 115                -- TODO Web
 116   deriving (Generic, Eq)
 117
 118 makeLenses ''DataOrigin
 119 deriveJSON (unPrefix "_do_") ''DataOrigin
 120 instance ToSchema DataOrigin where
 121   declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
 122
 123 allDataOrigins :: [DataOrigin]
 124 allDataOrigins = map InternalOrigin API.externalAPIs
 125               <> map ExternalOrigin API.externalAPIs
 126
 127 ---------------
 128 data DataText = DataOld ![NodeId]
 129               | DataNew ![[HyperdataDocument]]
 130
 131 -- TODO use the split parameter in config file
 132 getDataText :: FlowCmdM env err m
 133             => DataOrigin
 134             -> TermType Lang
 135             -> API.Query
 136             -> Maybe API.Limit
 137             -> m DataText
 138 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
 139                                   <$> splitEvery 500
 140                                   <$> API.get api (_tt_lang la) q li
 141
 142 getDataText (InternalOrigin _) _la q _li = do
 143   (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
 144                                            (UserName userMaster)
 145                                            (Left "")
 146                                            (Nothing :: Maybe HyperdataCorpus)
 147   ids <-  map fst <$> searchDocInDatabase cId (stemIt q)
 148   pure $ DataOld ids
 149
 150 -------------------------------------------------------------------------------
 151 flowDataText :: ( FlowCmdM env err m
 152                 )
 153                 => User
 154                 -> DataText
 155                 -> TermType Lang
 156                 -> CorpusId
 157                 -> Maybe FlowSocialListWith
 158                 -> (JobLog -> m ())
 159                 -> m CorpusId
 160 flowDataText u (DataOld ids) tt cid mfslw _ = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
 161   where
 162     corpusType = (Nothing :: Maybe HyperdataCorpus)
 163 flowDataText u (DataNew txt) tt cid mfslw logStatus = flowCorpus u (Right [cid]) tt mfslw txt logStatus
 164
 165 ------------------------------------------------------------------------
 166 -- TODO use proxy
 167 flowAnnuaire :: (FlowCmdM env err m)
 168              => User
 169              -> Either CorpusName [CorpusId]
 170              -> (TermType Lang)
 171              -> FilePath
 172              -> (JobLog -> m ())
 173              -> m AnnuaireId
 174 flowAnnuaire u n l filePath logStatus = do
 175   docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
 176   flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs logStatus
 177
 178 ------------------------------------------------------------------------
 179 flowCorpusFile :: (FlowCmdM env err m)
 180            => User
 181            -> Either CorpusName [CorpusId]
 182            -> Limit -- Limit the number of docs (for dev purpose)
 183            -> TermType Lang -> FileFormat -> FilePath
 184            -> Maybe FlowSocialListWith
 185            -> (JobLog -> m ())
 186            -> m CorpusId
 187 flowCorpusFile u n l la ff fp mfslw logStatus = do
 188   eParsed <- liftBase $ parseFile ff fp
 189   case eParsed of
 190     Right parsed -> do
 191       let docs = splitEvery 500 $ take l parsed
 192       flowCorpus u n la mfslw (map (map toHyperdataDocument) docs) logStatus
 193     Left e       -> panic $ "Error: " <> (T.pack e)
 194
 195 ------------------------------------------------------------------------
 196 -- | TODO improve the needed type to create/update a corpus
 197 -- (For now, Either is enough)
 198 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
 199            => User
 200            -> Either CorpusName [CorpusId]
 201            -> TermType Lang
 202            -> Maybe FlowSocialListWith
 203            -> [[a]]
 204            -> (JobLog -> m ())
 205            -> m CorpusId
 206 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
 207
 208
 209 flow :: ( FlowCmdM env err m
 210         , FlowCorpus a
 211         , MkCorpus c
 212         )
 213         => Maybe c
 214         -> User
 215         -> Either CorpusName [CorpusId]
 216         -> TermType Lang
 217         -> Maybe FlowSocialListWith
 218         -> [[a]]
 219         -> (JobLog -> m ())
 220         -> m CorpusId
 221 flow c u cn la mfslw docs logStatus = do
 222   -- TODO if public insertMasterDocs else insertUserDocs
 223   ids <- traverse (\(idx, doc) -> do
 224                       id <- insertMasterDocs c la doc
 225                       logStatus JobLog { _scst_succeeded = Just $ 1 + idx
 226                                        , _scst_failed    = Just 0
 227                                        , _scst_remaining = Just $ length docs - idx
 228                                        , _scst_events    = Just []
 229                                        }
 230                       pure id
 231                   ) (zip [1..] docs)
 232   flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
 233
 234
 235
 236
 237 ------------------------------------------------------------------------
 238 flowCorpusUser :: ( FlowCmdM env err m
 239                   , MkCorpus c
 240                   )
 241                => Lang
 242                -> User
 243                -> Either CorpusName [CorpusId]
 244                -> Maybe c
 245                -> [NodeId]
 246                -> Maybe FlowSocialListWith
 247                -> m CorpusId
 248 flowCorpusUser l user corpusName ctype ids mfslw = do
 249   -- User Flow
 250   (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
 251   -- NodeTexts is first
 252   _tId <- insertDefaultNode NodeTexts userCorpusId userId
 253   -- printDebug "NodeTexts: " tId
 254
 255   -- NodeList is second
 256   listId <- getOrMkList userCorpusId userId
 257   -- _cooc  <- insertDefaultNode NodeListCooc listId userId
 258   -- TODO: check if present already, ignore
 259   _ <- Doc.add userCorpusId ids
 260
 261   -- printDebug "Node Text Ids:" tId
 262
 263   -- User List Flow
 264   (masterUserId, _masterRootId, masterCorpusId)
 265     <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
 266
 267   --let gp = (GroupParams l 2 3 (StopSize 3))
 268   let gp = GroupWithPosTag l CoreNLP HashMap.empty
 269   ngs         <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
 270
 271   _userListId <- flowList_DbRepo listId ngs
 272   _mastListId <- getOrMkList masterCorpusId masterUserId
 273   -- _ <- insertOccsUpdates userCorpusId mastListId
 274   -- printDebug "userListId" userListId
 275   -- User Graph Flow
 276   _ <- insertDefaultNode NodeDashboard userCorpusId userId
 277   _ <- insertDefaultNode NodeGraph     userCorpusId userId
 278   --_ <- mkPhylo  userCorpusId userId
 279   -- Annuaire Flow
 280   -- _ <- mkAnnuaire  rootUserId userId
 281   pure userCorpusId
 282
 283
 284 insertMasterDocs :: ( FlowCmdM env err m
 285                     , FlowCorpus a
 286                     , MkCorpus   c
 287                     )
 288                  => Maybe c
 289                  -> TermType Lang
 290                  -> [a]
 291                  -> m [DocId]
 292 insertMasterDocs c lang hs  =  do
 293   (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
 294   (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
 295   _ <- Doc.add masterCorpusId ids'
 296   -- TODO
 297   -- create a corpus with database name (CSV or PubMed)
 298   -- add documents to the corpus (create node_node link)
 299   -- this will enable global database monitoring
 300
 301   -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
 302   mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
 303                 <- mapNodeIdNgrams
 304                 <$> documentIdWithNgrams
 305                     (extractNgramsT $ withLang lang documentsWithId)
 306                     documentsWithId
 307
 308   lId      <- getOrMkList masterCorpusId masterUserId
 309   _ <- saveDocNgramsWith lId mapNgramsDocs'
 310
 311   -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
 312   pure ids'
 313
 314 saveDocNgramsWith :: ( FlowCmdM env err m)
 315                   => ListId
 316                   -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
 317                   -> m ()
 318 saveDocNgramsWith lId mapNgramsDocs' = do
 319   terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
 320   let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
 321
 322   -- to be removed
 323   let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
 324
 325   -- new
 326   mapCgramsId <- listInsertDb lId toNodeNgramsW'
 327                $ map (first _ngramsTerms . second Map.keys)
 328                $ HashMap.toList mapNgramsDocs
 329
 330   -- insertDocNgrams
 331   _return <- insertContextNodeNgrams2
 332            $ catMaybes [ ContextNodeNgrams2 <$> Just nId
 333                                             <*> (getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms''))
 334                                             <*> Just (fromIntegral w :: Double)
 335                        | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
 336                        , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
 337                        , (nId, w)                       <- Map.toList mapNodeIdWeight
 338                        ]
 339   -- to be removed
 340   _   <- insertDocNgrams lId indexedNgrams
 341
 342   pure ()
 343
 344
 345 ------------------------------------------------------------------------
 346 -- TODO Type NodeDocumentUnicised
 347 insertDocs :: ( FlowCmdM env err m
 348               -- , FlowCorpus a
 349               , FlowInsertDB a
 350               )
 351               => UserId
 352               -> CorpusId
 353               -> [a]
 354               -> m ([DocId], [Indexed NodeId a])
 355 insertDocs uId cId hs = do
 356   let docs = map addUniqId hs
 357   newIds <- insertDb uId cId docs
 358   -- printDebug "newIds" newIds
 359   let
 360     newIds' = map reId newIds
 361     documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
 362   _ <- Doc.add cId newIds'
 363   pure (newIds', documentsWithId)
 364
 365
 366 ------------------------------------------------------------------------
 367 viewUniqId' :: UniqId a
 368             => a
 369             -> (Hash, a)
 370 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
 371       where
 372         err = panic "[ERROR] Database.Flow.toInsert"
 373
 374
 375 toInserted :: [ReturnId]
 376            -> Map Hash ReturnId
 377 toInserted =
 378   Map.fromList . map    (\r -> (reUniqId r, r)     )
 379                . filter (\r -> reInserted r == True)
 380
 381 mergeData :: Map Hash ReturnId
 382           -> Map Hash a
 383           -> [Indexed NodeId a]
 384 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
 385   where
 386     toDocumentWithId (sha,hpd) =
 387       Indexed <$> fmap reId (lookup sha rs)
 388               <*> Just hpd
 389
 390 ------------------------------------------------------------------------
 391 ------------------------------------------------------------------------
 392 ------------------------------------------------------------------------
 393 documentIdWithNgrams :: HasNodeError err
 394                      => (a
 395                      -> Cmd err (HashMap b (Map NgramsType Int)))
 396                      -> [Indexed NodeId a]
 397                      -> Cmd err [DocumentIdWithNgrams a b]
 398 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
 399   where
 400     toDocumentIdWithNgrams d = do
 401       e <- f $ _unIndex         d
 402       pure   $ DocumentIdWithNgrams d e
 403
 404
 405 -- | TODO check optimization
 406 mapNodeIdNgrams :: (Ord b, Hashable b)
 407                 => [DocumentIdWithNgrams a b]
 408                 -> HashMap b
 409                        (Map NgramsType
 410                             (Map NodeId Int)
 411                        )
 412 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
 413   where
 414     f :: DocumentIdWithNgrams a b
 415       -> HashMap b (Map NgramsType (Map NodeId Int))
 416     f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
 417       where
 418         nId = _index $ documentWithId d
 419
 420
 421 ------------------------------------------------------------------------
 422 instance ExtractNgramsT HyperdataContact
 423   where
 424     extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
 425       where
 426         extract :: TermType Lang -> HyperdataContact
 427                 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 428         extract _l hc' = do
 429           let authors = map text2ngrams
 430                       $ maybe ["Nothing"] (\a -> [a])
 431                       $ view (hc_who . _Just . cw_lastName) hc'
 432
 433           pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
 434
 435
 436 instance ExtractNgramsT HyperdataDocument
 437   where
 438     extractNgramsT :: TermType Lang
 439                    -> HyperdataDocument
 440                    -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 441     extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
 442       where
 443         extractNgramsT' :: TermType Lang
 444                         -> HyperdataDocument
 445                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 446         extractNgramsT' lang' doc = do
 447           let source    = text2ngrams
 448                         $ maybe "Nothing" identity
 449                         $ _hd_source doc
 450
 451               institutes = map text2ngrams
 452                          $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
 453                          $ _hd_institutes doc
 454
 455               authors    = map text2ngrams
 456                          $ maybe ["Nothing"] (T.splitOn ", ")
 457                          $ _hd_authors doc
 458
 459           terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
 460                  <$> concat
 461                  <$> liftBase (extractTerms lang' $ hasText doc)
 462
 463           pure $ HashMap.fromList
 464                $  [(SimpleNgrams source, Map.singleton Sources     1)                    ]
 465                <> [(SimpleNgrams     i', Map.singleton Institutes  1) | i' <- institutes ]
 466                <> [(SimpleNgrams     a', Map.singleton Authors     1) | a' <- authors    ]
 467                <> [(EnrichedNgrams   t', Map.singleton NgramsTerms 1) | t' <- terms'     ]
 468
 469 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
 470   where
 471     extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
 472
 473 instance HasText a => HasText (Node a)
 474   where
 475     hasText (Node _ _ _ _ _ _ _ h) = hasText h
 476
 477
 478
 479 -- | TODO putelsewhere
 480 -- | Upgrade function
 481 -- Suppose all documents are English (this is the case actually)
 482 indexAllDocumentsWithPosTag :: FlowCmdM env err m
 483                             => m ()
 484 indexAllDocumentsWithPosTag = do
 485   rootId    <- getRootId (UserName userMaster)
 486   corpusIds <- findNodesId rootId [NodeCorpus]
 487   docs      <- List.concat <$> mapM getDocumentsWithParentId corpusIds
 488   _ <- mapM extractInsert (splitEvery 1000 docs)
 489   pure ()
 490
 491 extractInsert :: FlowCmdM env err m
 492               => [Node HyperdataDocument] -> m ()
 493 extractInsert docs = do
 494   let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
 495   mapNgramsDocs' <- mapNodeIdNgrams
 496                 <$> documentIdWithNgrams
 497                     (extractNgramsT $ withLang (Multi EN) documentsWithId)
 498                     documentsWithId
 499   _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
 500   pure ()
 501
 502