src/Gargantext/Database/Action/Flow.hs

   1 {-|
   2 Module      : Gargantext.Database.Flow
   3 Description : Database Flow
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -- TODO-ACCESS:
  11 --   check userId       CanFillUserCorpus   userCorpusId
  12 --   check masterUserId CanFillMasterCorpus masterCorpusId
  13
  14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
  15 -- TODO-EVENTS: InsertedNodes
  16 -}
  17
  18 {-# OPTIONS_GHC -fno-warn-orphans #-}
  19
  20 {-# LANGUAGE ConstrainedClassMethods #-}
  21 {-# LANGUAGE ConstraintKinds         #-}
  22 {-# LANGUAGE InstanceSigs            #-}
  23 {-# LANGUAGE ScopedTypeVariables     #-}
  24 {-# LANGUAGE TemplateHaskell         #-}
  25
  26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  27   ( FlowCmdM
  28   , getDataText
  29   , flowDataText
  30   , flow
  31
  32   , flowCorpusFile
  33   , flowCorpus
  34   , flowAnnuaire
  35   , insertMasterDocs
  36
  37   , getOrMkRoot
  38   , getOrMk_RootWithCorpus
  39   , TermType(..)
  40   , DataOrigin(..)
  41   , allDataOrigins
  42
  43   , do_api
  44   , indexAllDocumentsWithPosTag
  45   )
  46     where
  47
  48 import Control.Lens ((^.), view, _Just, makeLenses)
  49 import Data.Aeson.TH (deriveJSON)
  50 import Data.Either
  51 import Data.HashMap.Strict (HashMap)
  52 import Data.Hashable (Hashable)
  53 import Data.List (concat)
  54 import Data.Map (Map, lookup)
  55 import Data.Maybe (catMaybes)
  56 import Data.Monoid
  57 import Data.Swagger
  58 import Data.Text (splitOn)
  59 import Data.Traversable (traverse)
  60 import Data.Tuple.Extra (first, second)
  61 import GHC.Generics (Generic)
  62 import System.FilePath (FilePath)
  63 import qualified Data.HashMap.Strict as HashMap
  64 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
  65 import qualified Data.Map as Map
  66
  67 import Gargantext.Core (Lang(..), PosTagAlgo(..))
  68 import Gargantext.Core.Ext.IMT (toSchoolName)
  69 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
  70 import Gargantext.Core.Flow.Types
  71 import Gargantext.Core.Text
  72 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
  73 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
  74 import Gargantext.Core.Text.List (buildNgramsLists)
  75 import Gargantext.Core.Text.Terms
  76 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
  77 import Gargantext.Core.Types (POS(NP))
  78 import Gargantext.Core.Types.Individu (User(..))
  79 import Gargantext.Core.Types.Main
  80 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
  81 import Gargantext.Database.Action.Flow.List
  82 import Gargantext.Database.Action.Flow.Types
  83 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
  84 import Gargantext.Database.Action.Search (searchDocInDatabase)
  85 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
  86 import Gargantext.Database.Admin.Types.Hyperdata
  87 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
  88 import Gargantext.Database.Prelude
  89 import Gargantext.Database.Query.Table.Ngrams
  90 import Gargantext.Database.Query.Table.Node
  91 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
  92 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
  93 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
  94 import Gargantext.Database.Query.Table.NodeNodeNgrams2
  95 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
  96 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
  97 import Gargantext.Database.Types
  98 import Gargantext.Prelude
  99 import Gargantext.Prelude.Crypto.Hash (Hash)
 100 import qualified Gargantext.Core.Text.Corpus.API as API
 101 import qualified Gargantext.Database.Query.Table.Node.Document.Add  as Doc  (add)
 102
 103 ------------------------------------------------------------------------
 104 -- Impots for upgrade function
 105 import Gargantext.Database.Query.Tree.Root (getRootId)
 106 import Gargantext.Database.Query.Tree (findNodesId)
 107 import qualified Data.List as List
 108 ------------------------------------------------------------------------
 109 -- TODO use internal with API name (could be old data)
 110 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
 111                 | ExternalOrigin { _do_api :: API.ExternalAPIs }
 112                -- TODO Web
 113   deriving (Generic, Eq)
 114
 115 makeLenses ''DataOrigin
 116 deriveJSON (unPrefix "_do_") ''DataOrigin
 117 instance ToSchema DataOrigin where
 118   declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
 119
 120 allDataOrigins :: [DataOrigin]
 121 allDataOrigins = map InternalOrigin API.externalAPIs
 122               <> map ExternalOrigin API.externalAPIs
 123
 124 ---------------
 125 data DataText = DataOld ![NodeId]
 126               | DataNew ![[HyperdataDocument]]
 127
 128 -- TODO use the split parameter in config file
 129 getDataText :: FlowCmdM env err m
 130             => DataOrigin
 131             -> TermType Lang
 132             -> API.Query
 133             -> Maybe API.Limit
 134             -> m DataText
 135 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
 136                                   <$> splitEvery 500
 137                                   <$> API.get api (_tt_lang la) q li
 138
 139 getDataText (InternalOrigin _) _la q _li = do
 140   (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
 141                                            (UserName userMaster)
 142                                            (Left "")
 143                                            (Nothing :: Maybe HyperdataCorpus)
 144   ids <-  map fst <$> searchDocInDatabase cId (stemIt q)
 145   pure $ DataOld ids
 146
 147 -------------------------------------------------------------------------------
 148 flowDataText :: ( FlowCmdM env err m
 149                 )
 150                 => User
 151                 -> DataText
 152                 -> TermType Lang
 153                 -> CorpusId
 154                 -> m CorpusId
 155 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
 156   where
 157     corpusType = (Nothing :: Maybe HyperdataCorpus)
 158 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
 159
 160 ------------------------------------------------------------------------
 161 -- TODO use proxy
 162 flowAnnuaire :: (FlowCmdM env err m)
 163              => User
 164              -> Either CorpusName [CorpusId]
 165              -> (TermType Lang)
 166              -> FilePath
 167              -> m AnnuaireId
 168 flowAnnuaire u n l filePath = do
 169   docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
 170   flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
 171
 172 ------------------------------------------------------------------------
 173 flowCorpusFile :: (FlowCmdM env err m)
 174            => User
 175            -> Either CorpusName [CorpusId]
 176            -> Limit -- Limit the number of docs (for dev purpose)
 177            -> TermType Lang -> FileFormat -> FilePath
 178            -> m CorpusId
 179 flowCorpusFile u n l la ff fp = do
 180   docs <- liftBase ( splitEvery 500
 181                  <$> take l
 182                  <$> parseFile ff fp
 183                  )
 184   flowCorpus u n la (map (map toHyperdataDocument) docs)
 185
 186 ------------------------------------------------------------------------
 187 -- | TODO improve the needed type to create/update a corpus
 188 -- (For now, Either is enough)
 189 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
 190            => User
 191            -> Either CorpusName [CorpusId]
 192            -> TermType Lang
 193            -> [[a]]
 194            -> m CorpusId
 195 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
 196
 197
 198 flow :: ( FlowCmdM env err m
 199         , FlowCorpus a
 200         , MkCorpus c
 201         )
 202         => Maybe c
 203         -> User
 204         -> Either CorpusName [CorpusId]
 205         -> TermType Lang
 206         -> [[a]]
 207         -> m CorpusId
 208 flow c u cn la docs = do
 209   -- TODO if public insertMasterDocs else insertUserDocs
 210   ids <- traverse (insertMasterDocs c la) docs
 211   flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
 212
 213 ------------------------------------------------------------------------
 214 flowCorpusUser :: ( FlowCmdM env err m
 215                   , MkCorpus c
 216                   )
 217                => Lang
 218                -> User
 219                -> Either CorpusName [CorpusId]
 220                -> Maybe c
 221                -> [NodeId]
 222                -> m CorpusId
 223 flowCorpusUser l user corpusName ctype ids = do
 224   -- User Flow
 225   (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
 226   -- NodeTexts is first
 227   _tId <- insertDefaultNode NodeTexts userCorpusId userId
 228   -- printDebug "NodeTexts: " tId
 229
 230   -- NodeList is second
 231   listId <- getOrMkList userCorpusId userId
 232   -- _cooc  <- insertDefaultNode NodeListCooc listId userId
 233   -- TODO: check if present already, ignore
 234   _ <- Doc.add userCorpusId ids
 235
 236   -- printDebug "Node Text Ids:" tId
 237
 238   -- User List Flow
 239   (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
 240
 241   --let gp = (GroupParams l 2 3 (StopSize 3))
 242   let gp = GroupWithPosTag l CoreNLP HashMap.empty
 243   ngs         <- buildNgramsLists gp user userCorpusId masterCorpusId
 244
 245   _userListId <- flowList_DbRepo listId ngs
 246   _mastListId <- getOrMkList masterCorpusId masterUserId
 247   -- _ <- insertOccsUpdates userCorpusId mastListId
 248   -- printDebug "userListId" userListId
 249   -- User Graph Flow
 250   _ <- insertDefaultNode NodeDashboard userCorpusId userId
 251   _ <- insertDefaultNode NodeGraph     userCorpusId userId
 252   --_ <- mkPhylo  userCorpusId userId
 253   -- Annuaire Flow
 254   -- _ <- mkAnnuaire  rootUserId userId
 255   pure userCorpusId
 256
 257
 258 insertMasterDocs :: ( FlowCmdM env err m
 259                     , FlowCorpus a
 260                     , MkCorpus   c
 261                     )
 262                  => Maybe c
 263                  -> TermType Lang
 264                  -> [a]
 265                  -> m [DocId]
 266 insertMasterDocs c lang hs  =  do
 267   (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
 268   (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
 269   _ <- Doc.add masterCorpusId ids'
 270   -- TODO
 271   -- create a corpus with database name (CSV or PubMed)
 272   -- add documents to the corpus (create node_node link)
 273   -- this will enable global database monitoring
 274
 275   -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
 276   mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
 277                 <- mapNodeIdNgrams
 278                 <$> documentIdWithNgrams
 279                     (extractNgramsT $ withLang lang documentsWithId)
 280                     documentsWithId
 281
 282   terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
 283   let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
 284
 285   -- to be removed
 286   let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
 287
 288   -- new
 289   lId      <- getOrMkList masterCorpusId masterUserId
 290   mapCgramsId <- listInsertDb lId toNodeNgramsW'
 291                $ map (first _ngramsTerms . second Map.keys)
 292                $ HashMap.toList mapNgramsDocs
 293   -- insertDocNgrams
 294   _return <- insertNodeNodeNgrams2
 295            $ catMaybes [ NodeNodeNgrams2 <$> Just nId
 296                                          <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
 297                                          <*> Just (fromIntegral w :: Double)
 298                        | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
 299                        , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
 300                        , (nId, w)                       <- Map.toList mapNodeIdWeight
 301                        ]
 302
 303   -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
 304   -- to be removed
 305   _   <- insertDocNgrams lId indexedNgrams
 306   pure ids'
 307
 308 ------------------------------------------------------------------------
 309 -- TODO Type NodeDocumentUnicised
 310 insertDocs :: ( FlowCmdM env err m
 311               -- , FlowCorpus a
 312               , FlowInsertDB a
 313               )
 314               => UserId
 315               -> CorpusId
 316               -> [a]
 317               -> m ([DocId], [Indexed NodeId a])
 318 insertDocs uId cId hs = do
 319   let docs = map addUniqId hs
 320   newIds <- insertDb uId cId docs
 321   -- printDebug "newIds" newIds
 322   let
 323     newIds' = map reId newIds
 324     documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
 325   _ <- Doc.add cId newIds'
 326   pure (newIds', documentsWithId)
 327
 328
 329 ------------------------------------------------------------------------
 330 viewUniqId' :: UniqId a
 331             => a
 332             -> (Hash, a)
 333 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
 334       where
 335         err = panic "[ERROR] Database.Flow.toInsert"
 336
 337
 338 toInserted :: [ReturnId]
 339            -> Map Hash ReturnId
 340 toInserted =
 341   Map.fromList . map    (\r -> (reUniqId r, r)     )
 342                . filter (\r -> reInserted r == True)
 343
 344 mergeData :: Map Hash ReturnId
 345           -> Map Hash a
 346           -> [Indexed NodeId a]
 347 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
 348   where
 349     toDocumentWithId (sha,hpd) =
 350       Indexed <$> fmap reId (lookup sha rs)
 351               <*> Just hpd
 352
 353 ------------------------------------------------------------------------
 354 ------------------------------------------------------------------------
 355 ------------------------------------------------------------------------
 356 documentIdWithNgrams :: HasNodeError err
 357                      => (a
 358                      -> Cmd err (HashMap b (Map NgramsType Int)))
 359                      -> [Indexed NodeId a]
 360                      -> Cmd err [DocumentIdWithNgrams a b]
 361 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
 362   where
 363     toDocumentIdWithNgrams d = do
 364       e <- f $ _unIndex         d
 365       pure   $ DocumentIdWithNgrams d e
 366
 367
 368 -- | TODO check optimization
 369 mapNodeIdNgrams :: (Ord b, Hashable b)
 370                 => [DocumentIdWithNgrams a b]
 371                 -> HashMap b
 372                        (Map NgramsType
 373                             (Map NodeId Int)
 374                        )
 375 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
 376   where
 377     f :: DocumentIdWithNgrams a b
 378       -> HashMap b (Map NgramsType (Map NodeId Int))
 379     f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
 380       where
 381         nId = _index $ documentWithId d
 382
 383
 384 ------------------------------------------------------------------------
 385 instance ExtractNgramsT HyperdataContact
 386   where
 387     extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
 388       where
 389         extract :: TermType Lang -> HyperdataContact
 390                 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 391         extract _l hc' = do
 392           let authors = map text2ngrams
 393                       $ maybe ["Nothing"] (\a -> [a])
 394                       $ view (hc_who . _Just . cw_lastName) hc'
 395
 396           pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
 397
 398
 399 instance ExtractNgramsT HyperdataDocument
 400   where
 401     extractNgramsT :: TermType Lang
 402                    -> HyperdataDocument
 403                    -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 404     extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
 405       where
 406         extractNgramsT' :: TermType Lang
 407                         -> HyperdataDocument
 408                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 409         extractNgramsT' lang' doc = do
 410           let source    = text2ngrams
 411                         $ maybe "Nothing" identity
 412                         $ _hd_source doc
 413
 414               institutes = map text2ngrams
 415                          $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
 416                          $ _hd_institutes doc
 417
 418               authors    = map text2ngrams
 419                          $ maybe ["Nothing"] (splitOn ", ")
 420                          $ _hd_authors doc
 421
 422           terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
 423                  <$> concat
 424                  <$> liftBase (extractTerms lang' $ hasText doc)
 425
 426           pure $ HashMap.fromList
 427                $  [(SimpleNgrams source, Map.singleton Sources     1)                    ]
 428                <> [(SimpleNgrams     i', Map.singleton Institutes  1) | i' <- institutes ]
 429                <> [(SimpleNgrams     a', Map.singleton Authors     1) | a' <- authors    ]
 430                <> [(EnrichedNgrams   t', Map.singleton NgramsTerms 1) | t' <- terms'     ]
 431
 432 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
 433   where
 434     extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
 435
 436 instance HasText a => HasText (Node a)
 437   where
 438     hasText (Node _ _ _ _ _ _ _ h) = hasText h
 439
 440
 441
 442 -- | TODO putelsewhere
 443 -- | Upgrade function
 444 -- Suppose all documents are English (this is the case actually)
 445 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
 446 indexAllDocumentsWithPosTag = do
 447   rootId    <- getRootId (UserName userMaster)
 448   corpusIds <- findNodesId rootId [NodeCorpus]
 449   docs      <- List.concat <$> mapM getDocumentsWithParentId corpusIds
 450
 451   _ <- mapM extractInsert (splitEvery 1000 docs)
 452
 453   pure ()
 454
 455 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
 456 extractInsert docs = do
 457   let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
 458
 459   mapNgramsDocs' <- mapNodeIdNgrams
 460                 <$> documentIdWithNgrams
 461                     (extractNgramsT $ withLang (Multi EN) documentsWithId)
 462                     documentsWithId
 463
 464   _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
 465
 466   pure ()
 467
 468
 469
 470
 471
 472
 473
 474