src/Gargantext/Database/Action/Flow.hs

   1 {-|
   2 Module      : Gargantext.Database.Flow
   3 Description : Database Flow
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -- TODO-ACCESS:
  11 --   check userId       CanFillUserCorpus   userCorpusId
  12 --   check masterUserId CanFillMasterCorpus masterCorpusId
  13
  14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
  15 -- TODO-EVENTS: InsertedNodes
  16 -}
  17
  18 {-# OPTIONS_GHC -fno-warn-orphans #-}
  19
  20 {-# LANGUAGE ConstrainedClassMethods #-}
  21 {-# LANGUAGE ConstraintKinds         #-}
  22 {-# LANGUAGE InstanceSigs            #-}
  23 {-# LANGUAGE ScopedTypeVariables     #-}
  24 {-# LANGUAGE TemplateHaskell         #-}
  25
  26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  27   ( FlowCmdM
  28   , getDataText
  29   , flowDataText
  30   , flow
  31
  32   , flowCorpusFile
  33   , flowCorpus
  34   , flowAnnuaire
  35   , insertMasterDocs
  36
  37   , getOrMkRoot
  38   , getOrMk_RootWithCorpus
  39   , TermType(..)
  40   , DataOrigin(..)
  41   , allDataOrigins
  42
  43   , do_api
  44   , indexAllDocumentsWithPosTag
  45   )
  46     where
  47
  48 import Control.Lens ((^.), view, _Just, makeLenses)
  49 import Data.Aeson.TH (deriveJSON)
  50 import Data.Either
  51 import Data.HashMap.Strict (HashMap)
  52 import Data.Hashable (Hashable)
  53 import Data.List (concat)
  54 import Data.Map (Map, lookup)
  55 import Data.Maybe (catMaybes)
  56 import Data.Monoid
  57 import Data.Swagger
  58 import Data.Text (splitOn)
  59 import Data.Traversable (traverse)
  60 import Data.Tuple.Extra (first, second)
  61 import GHC.Generics (Generic)
  62 import System.FilePath (FilePath)
  63 import qualified Data.HashMap.Strict as HashMap
  64 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
  65 import qualified Data.Map as Map
  66
  67 import Gargantext.Core (Lang(..), PosTagAlgo(..))
  68 import Gargantext.Core.Ext.IMT (toSchoolName)
  69 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
  70 import Gargantext.Core.Flow.Types
  71 import Gargantext.Core.Text
  72 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
  73 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
  74 import Gargantext.Core.Text.List (buildNgramsLists)
  75 import Gargantext.Core.Text.Terms
  76 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
  77 import Gargantext.Core.Types (POS(NP))
  78 import Gargantext.Core.Types.Individu (User(..))
  79 import Gargantext.Core.Types.Main
  80 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
  81 import Gargantext.Database.Action.Flow.List
  82 import Gargantext.Database.Action.Flow.Types
  83 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
  84 import Gargantext.Database.Action.Search (searchDocInDatabase)
  85 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
  86 import Gargantext.Database.Admin.Types.Hyperdata
  87 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
  88 import Gargantext.Database.Prelude
  89 import Gargantext.Database.Query.Table.Ngrams
  90 import Gargantext.Database.Query.Table.Node
  91 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
  92 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
  93 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
  94 import Gargantext.Database.Query.Table.NodeNodeNgrams2
  95 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
  96 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
  97 import Gargantext.Database.Types
  98 import Gargantext.Prelude
  99 import Gargantext.Prelude.Crypto.Hash (Hash)
 100 import qualified Gargantext.Core.Text.Corpus.API as API
 101 import qualified Gargantext.Database.Query.Table.Node.Document.Add  as Doc  (add)
 102
 103 ------------------------------------------------------------------------
 104 -- Impots for upgrade function
 105 import Gargantext.Database.Query.Tree.Root (getRootId)
 106 import Gargantext.Database.Query.Tree (findNodesId)
 107 import qualified Data.List as List
 108 ------------------------------------------------------------------------
 109 -- TODO use internal with API name (could be old data)
 110 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
 111                 | ExternalOrigin { _do_api :: API.ExternalAPIs }
 112                -- TODO Web
 113   deriving (Generic, Eq)
 114
 115 makeLenses ''DataOrigin
 116 deriveJSON (unPrefix "_do_") ''DataOrigin
 117 instance ToSchema DataOrigin where
 118   declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
 119
 120 allDataOrigins :: [DataOrigin]
 121 allDataOrigins = map InternalOrigin API.externalAPIs
 122               <> map ExternalOrigin API.externalAPIs
 123
 124 ---------------
 125 data DataText = DataOld ![NodeId]
 126               | DataNew ![[HyperdataDocument]]
 127
 128 -- TODO use the split parameter in config file
 129 getDataText :: FlowCmdM env err m
 130             => DataOrigin
 131             -> TermType Lang
 132             -> API.Query
 133             -> Maybe API.Limit
 134             -> m DataText
 135 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
 136                                   <$> splitEvery 500
 137                                   <$> API.get api (_tt_lang la) q li
 138
 139 getDataText (InternalOrigin _) _la q _li = do
 140   (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
 141                                            (UserName userMaster)
 142                                            (Left "")
 143                                            (Nothing :: Maybe HyperdataCorpus)
 144   ids <-  map fst <$> searchDocInDatabase cId (stemIt q)
 145   pure $ DataOld ids
 146
 147 -------------------------------------------------------------------------------
 148 flowDataText :: ( FlowCmdM env err m
 149                 )
 150                 => User
 151                 -> DataText
 152                 -> TermType Lang
 153                 -> CorpusId
 154                 -> m CorpusId
 155 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
 156   where
 157     corpusType = (Nothing :: Maybe HyperdataCorpus)
 158 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
 159
 160 ------------------------------------------------------------------------
 161 -- TODO use proxy
 162 flowAnnuaire :: (FlowCmdM env err m)
 163              => User
 164              -> Either CorpusName [CorpusId]
 165              -> (TermType Lang)
 166              -> FilePath
 167              -> m AnnuaireId
 168 flowAnnuaire u n l filePath = do
 169   docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
 170   flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
 171
 172 ------------------------------------------------------------------------
 173 flowCorpusFile :: (FlowCmdM env err m)
 174            => User
 175            -> Either CorpusName [CorpusId]
 176            -> Limit -- Limit the number of docs (for dev purpose)
 177            -> TermType Lang -> FileFormat -> FilePath
 178            -> m CorpusId
 179 flowCorpusFile u n l la ff fp = do
 180   docs <- liftBase ( splitEvery 500
 181                  <$> take l
 182                  <$> parseFile ff fp
 183                  )
 184   flowCorpus u n la (map (map toHyperdataDocument) docs)
 185
 186 ------------------------------------------------------------------------
 187 -- | TODO improve the needed type to create/update a corpus
 188 -- (For now, Either is enough)
 189 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
 190            => User
 191            -> Either CorpusName [CorpusId]
 192            -> TermType Lang
 193            -> [[a]]
 194            -> m CorpusId
 195 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
 196
 197
 198 flow :: ( FlowCmdM env err m
 199         , FlowCorpus a
 200         , MkCorpus c
 201         )
 202         => Maybe c
 203         -> User
 204         -> Either CorpusName [CorpusId]
 205         -> TermType Lang
 206         -> [[a]]
 207         -> m CorpusId
 208 flow c u cn la docs = do
 209   -- TODO if public insertMasterDocs else insertUserDocs
 210   ids <- traverse (insertMasterDocs c la) docs
 211   flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
 212
 213 ------------------------------------------------------------------------
 214 flowCorpusUser :: ( FlowCmdM env err m
 215                   , MkCorpus c
 216                   )
 217                => Lang
 218                -> User
 219                -> Either CorpusName [CorpusId]
 220                -> Maybe c
 221                -> [NodeId]
 222                -> m CorpusId
 223 flowCorpusUser l user corpusName ctype ids = do
 224   -- User Flow
 225   (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
 226   -- NodeTexts is first
 227   _tId <- insertDefaultNode NodeTexts userCorpusId userId
 228   -- printDebug "NodeTexts: " tId
 229
 230   -- NodeList is second
 231   listId <- getOrMkList userCorpusId userId
 232   -- _cooc  <- insertDefaultNode NodeListCooc listId userId
 233   -- TODO: check if present already, ignore
 234   _ <- Doc.add userCorpusId ids
 235
 236   -- printDebug "Node Text Ids:" tId
 237
 238   -- User List Flow
 239   (masterUserId, _masterRootId, masterCorpusId)
 240     <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
 241
 242   --let gp = (GroupParams l 2 3 (StopSize 3))
 243   let gp = GroupWithPosTag l CoreNLP HashMap.empty
 244   ngs         <- buildNgramsLists gp user userCorpusId masterCorpusId
 245
 246   _userListId <- flowList_DbRepo listId ngs
 247   _mastListId <- getOrMkList masterCorpusId masterUserId
 248   -- _ <- insertOccsUpdates userCorpusId mastListId
 249   -- printDebug "userListId" userListId
 250   -- User Graph Flow
 251   _ <- insertDefaultNode NodeDashboard userCorpusId userId
 252   _ <- insertDefaultNode NodeGraph     userCorpusId userId
 253   --_ <- mkPhylo  userCorpusId userId
 254   -- Annuaire Flow
 255   -- _ <- mkAnnuaire  rootUserId userId
 256   pure userCorpusId
 257
 258
 259 insertMasterDocs :: ( FlowCmdM env err m
 260                     , FlowCorpus a
 261                     , MkCorpus   c
 262                     )
 263                  => Maybe c
 264                  -> TermType Lang
 265                  -> [a]
 266                  -> m [DocId]
 267 insertMasterDocs c lang hs  =  do
 268   (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
 269   (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
 270   _ <- Doc.add masterCorpusId ids'
 271   -- TODO
 272   -- create a corpus with database name (CSV or PubMed)
 273   -- add documents to the corpus (create node_node link)
 274   -- this will enable global database monitoring
 275
 276   -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
 277   mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
 278                 <- mapNodeIdNgrams
 279                 <$> documentIdWithNgrams
 280                     (extractNgramsT $ withLang lang documentsWithId)
 281                     documentsWithId
 282
 283   terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
 284   let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
 285
 286   -- to be removed
 287   let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
 288
 289   -- new
 290   lId      <- getOrMkList masterCorpusId masterUserId
 291   mapCgramsId <- listInsertDb lId toNodeNgramsW'
 292                $ map (first _ngramsTerms . second Map.keys)
 293                $ HashMap.toList mapNgramsDocs
 294   -- insertDocNgrams
 295   _return <- insertNodeNodeNgrams2
 296            $ catMaybes [ NodeNodeNgrams2 <$> Just nId
 297                                          <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
 298                                          <*> Just (fromIntegral w :: Double)
 299                        | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
 300                        , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
 301                        , (nId, w)                       <- Map.toList mapNodeIdWeight
 302                        ]
 303
 304   -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
 305   -- to be removed
 306   _   <- insertDocNgrams lId indexedNgrams
 307   pure ids'
 308
 309 ------------------------------------------------------------------------
 310 -- TODO Type NodeDocumentUnicised
 311 insertDocs :: ( FlowCmdM env err m
 312               -- , FlowCorpus a
 313               , FlowInsertDB a
 314               )
 315               => UserId
 316               -> CorpusId
 317               -> [a]
 318               -> m ([DocId], [Indexed NodeId a])
 319 insertDocs uId cId hs = do
 320   let docs = map addUniqId hs
 321   newIds <- insertDb uId cId docs
 322   -- printDebug "newIds" newIds
 323   let
 324     newIds' = map reId newIds
 325     documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
 326   _ <- Doc.add cId newIds'
 327   pure (newIds', documentsWithId)
 328
 329
 330 ------------------------------------------------------------------------
 331 viewUniqId' :: UniqId a
 332             => a
 333             -> (Hash, a)
 334 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
 335       where
 336         err = panic "[ERROR] Database.Flow.toInsert"
 337
 338
 339 toInserted :: [ReturnId]
 340            -> Map Hash ReturnId
 341 toInserted =
 342   Map.fromList . map    (\r -> (reUniqId r, r)     )
 343                . filter (\r -> reInserted r == True)
 344
 345 mergeData :: Map Hash ReturnId
 346           -> Map Hash a
 347           -> [Indexed NodeId a]
 348 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
 349   where
 350     toDocumentWithId (sha,hpd) =
 351       Indexed <$> fmap reId (lookup sha rs)
 352               <*> Just hpd
 353
 354 ------------------------------------------------------------------------
 355 ------------------------------------------------------------------------
 356 ------------------------------------------------------------------------
 357 documentIdWithNgrams :: HasNodeError err
 358                      => (a
 359                      -> Cmd err (HashMap b (Map NgramsType Int)))
 360                      -> [Indexed NodeId a]
 361                      -> Cmd err [DocumentIdWithNgrams a b]
 362 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
 363   where
 364     toDocumentIdWithNgrams d = do
 365       e <- f $ _unIndex         d
 366       pure   $ DocumentIdWithNgrams d e
 367
 368
 369 -- | TODO check optimization
 370 mapNodeIdNgrams :: (Ord b, Hashable b)
 371                 => [DocumentIdWithNgrams a b]
 372                 -> HashMap b
 373                        (Map NgramsType
 374                             (Map NodeId Int)
 375                        )
 376 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
 377   where
 378     f :: DocumentIdWithNgrams a b
 379       -> HashMap b (Map NgramsType (Map NodeId Int))
 380     f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
 381       where
 382         nId = _index $ documentWithId d
 383
 384
 385 ------------------------------------------------------------------------
 386 instance ExtractNgramsT HyperdataContact
 387   where
 388     extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
 389       where
 390         extract :: TermType Lang -> HyperdataContact
 391                 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 392         extract _l hc' = do
 393           let authors = map text2ngrams
 394                       $ maybe ["Nothing"] (\a -> [a])
 395                       $ view (hc_who . _Just . cw_lastName) hc'
 396
 397           pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
 398
 399
 400 instance ExtractNgramsT HyperdataDocument
 401   where
 402     extractNgramsT :: TermType Lang
 403                    -> HyperdataDocument
 404                    -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 405     extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
 406       where
 407         extractNgramsT' :: TermType Lang
 408                         -> HyperdataDocument
 409                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 410         extractNgramsT' lang' doc = do
 411           let source    = text2ngrams
 412                         $ maybe "Nothing" identity
 413                         $ _hd_source doc
 414
 415               institutes = map text2ngrams
 416                          $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
 417                          $ _hd_institutes doc
 418
 419               authors    = map text2ngrams
 420                          $ maybe ["Nothing"] (splitOn ", ")
 421                          $ _hd_authors doc
 422
 423           terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
 424                  <$> concat
 425                  <$> liftBase (extractTerms lang' $ hasText doc)
 426
 427           pure $ HashMap.fromList
 428                $  [(SimpleNgrams source, Map.singleton Sources     1)                    ]
 429                <> [(SimpleNgrams     i', Map.singleton Institutes  1) | i' <- institutes ]
 430                <> [(SimpleNgrams     a', Map.singleton Authors     1) | a' <- authors    ]
 431                <> [(EnrichedNgrams   t', Map.singleton NgramsTerms 1) | t' <- terms'     ]
 432
 433 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
 434   where
 435     extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
 436
 437 instance HasText a => HasText (Node a)
 438   where
 439     hasText (Node _ _ _ _ _ _ _ h) = hasText h
 440
 441
 442
 443 -- | TODO putelsewhere
 444 -- | Upgrade function
 445 -- Suppose all documents are English (this is the case actually)
 446 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
 447 indexAllDocumentsWithPosTag = do
 448   rootId    <- getRootId (UserName userMaster)
 449   corpusIds <- findNodesId rootId [NodeCorpus]
 450   docs      <- List.concat <$> mapM getDocumentsWithParentId corpusIds
 451
 452   _ <- mapM extractInsert (splitEvery 1000 docs)
 453
 454   pure ()
 455
 456 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
 457 extractInsert docs = do
 458   let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
 459
 460   mapNgramsDocs' <- mapNodeIdNgrams
 461                 <$> documentIdWithNgrams
 462                     (extractNgramsT $ withLang (Multi EN) documentsWithId)
 463                     documentsWithId
 464
 465   _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
 466
 467   pure ()
 468
 469
 470
 471
 472
 473
 474
 475