src/Gargantext/Database/Action/Flow.hs

   1 {-|
   2 Module      : Gargantext.Database.Flow
   3 Description : Database Flow
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -- TODO-ACCESS:
  11 --   check userId       CanFillUserCorpus   userCorpusId
  12 --   check masterUserId CanFillMasterCorpus masterCorpusId
  13
  14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
  15 -- TODO-EVENTS: InsertedNodes
  16 -}
  17
  18 {-# OPTIONS_GHC -fno-warn-orphans #-}
  19
  20 {-# LANGUAGE ConstrainedClassMethods #-}
  21 {-# LANGUAGE ConstraintKinds         #-}
  22 {-# LANGUAGE InstanceSigs            #-}
  23 {-# LANGUAGE ScopedTypeVariables     #-}
  24 {-# LANGUAGE TemplateHaskell         #-}
  25
  26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  27   ( FlowCmdM
  28   , getDataText
  29   , flowDataText
  30   , flow
  31
  32   , flowCorpusFile
  33   , flowCorpus
  34   , flowAnnuaire
  35   , insertMasterDocs
  36   , saveDocNgramsWith
  37
  38   , getOrMkRoot
  39   , getOrMk_RootWithCorpus
  40   , TermType(..)
  41   , DataOrigin(..)
  42   , allDataOrigins
  43
  44   , do_api
  45   , indexAllDocumentsWithPosTag
  46   )
  47     where
  48
  49 import Control.Lens ((^.), view, _Just, makeLenses)
  50 import Data.Aeson.TH (deriveJSON)
  51 import Data.Either
  52 import Data.HashMap.Strict (HashMap)
  53 import Data.Hashable (Hashable)
  54 import Data.List (concat)
  55 import Data.Map (Map, lookup)
  56 import Data.Maybe (catMaybes)
  57 import Data.Monoid
  58 import Data.Swagger
  59 import Data.Text (splitOn)
  60 import Data.Traversable (traverse)
  61 import Data.Tuple.Extra (first, second)
  62 import GHC.Generics (Generic)
  63 import System.FilePath (FilePath)
  64 import qualified Data.HashMap.Strict as HashMap
  65 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
  66 import qualified Data.Map as Map
  67
  68 import Gargantext.Core (Lang(..), PosTagAlgo(..))
  69 import Gargantext.Core.Ext.IMT (toSchoolName)
  70 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
  71 import Gargantext.Core.Flow.Types
  72 import Gargantext.Core.Text
  73 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
  74 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
  75 import Gargantext.Core.Text.List (buildNgramsLists)
  76 import Gargantext.Core.Text.Terms
  77 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
  78 import Gargantext.Core.Types (POS(NP))
  79 import Gargantext.Core.Types.Individu (User(..))
  80 import Gargantext.Core.Types.Main
  81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
  82 import Gargantext.Database.Action.Flow.List
  83 import Gargantext.Database.Action.Flow.Types
  84 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
  85 import Gargantext.Database.Action.Search (searchDocInDatabase)
  86 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
  87 import Gargantext.Database.Admin.Types.Hyperdata
  88 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
  89 import Gargantext.Database.Prelude
  90 import Gargantext.Database.Query.Table.Ngrams
  91 import Gargantext.Database.Query.Table.Node
  92 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
  93 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
  94 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
  95 import Gargantext.Database.Query.Table.NodeNodeNgrams2
  96 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
  97 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
  98 import Gargantext.Database.Types
  99 import Gargantext.Prelude
 100 import Gargantext.Prelude.Crypto.Hash (Hash)
 101 import qualified Gargantext.Core.Text.Corpus.API as API
 102 import qualified Gargantext.Database.Query.Table.Node.Document.Add  as Doc  (add)
 103
 104 ------------------------------------------------------------------------
 105 -- Impots for upgrade function
 106 import Gargantext.Database.Query.Tree.Root (getRootId)
 107 import Gargantext.Database.Query.Tree (findNodesId)
 108 import qualified Data.List as List
 109 ------------------------------------------------------------------------
 110 -- TODO use internal with API name (could be old data)
 111 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
 112                 | ExternalOrigin { _do_api :: API.ExternalAPIs }
 113                -- TODO Web
 114   deriving (Generic, Eq)
 115
 116 makeLenses ''DataOrigin
 117 deriveJSON (unPrefix "_do_") ''DataOrigin
 118 instance ToSchema DataOrigin where
 119   declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
 120
 121 allDataOrigins :: [DataOrigin]
 122 allDataOrigins = map InternalOrigin API.externalAPIs
 123               <> map ExternalOrigin API.externalAPIs
 124
 125 ---------------
 126 data DataText = DataOld ![NodeId]
 127               | DataNew ![[HyperdataDocument]]
 128
 129 -- TODO use the split parameter in config file
 130 getDataText :: FlowCmdM env err m
 131             => DataOrigin
 132             -> TermType Lang
 133             -> API.Query
 134             -> Maybe API.Limit
 135             -> m DataText
 136 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
 137                                   <$> splitEvery 500
 138                                   <$> API.get api (_tt_lang la) q li
 139
 140 getDataText (InternalOrigin _) _la q _li = do
 141   (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
 142                                            (UserName userMaster)
 143                                            (Left "")
 144                                            (Nothing :: Maybe HyperdataCorpus)
 145   ids <-  map fst <$> searchDocInDatabase cId (stemIt q)
 146   pure $ DataOld ids
 147
 148 -------------------------------------------------------------------------------
 149 flowDataText :: ( FlowCmdM env err m
 150                 )
 151                 => User
 152                 -> DataText
 153                 -> TermType Lang
 154                 -> CorpusId
 155                 -> m CorpusId
 156 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
 157   where
 158     corpusType = (Nothing :: Maybe HyperdataCorpus)
 159 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
 160
 161 ------------------------------------------------------------------------
 162 -- TODO use proxy
 163 flowAnnuaire :: (FlowCmdM env err m)
 164              => User
 165              -> Either CorpusName [CorpusId]
 166              -> (TermType Lang)
 167              -> FilePath
 168              -> m AnnuaireId
 169 flowAnnuaire u n l filePath = do
 170   docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
 171   flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
 172
 173 ------------------------------------------------------------------------
 174 flowCorpusFile :: (FlowCmdM env err m)
 175            => User
 176            -> Either CorpusName [CorpusId]
 177            -> Limit -- Limit the number of docs (for dev purpose)
 178            -> TermType Lang -> FileFormat -> FilePath
 179            -> m CorpusId
 180 flowCorpusFile u n l la ff fp = do
 181   docs <- liftBase ( splitEvery 500
 182                  <$> take l
 183                  <$> parseFile ff fp
 184                  )
 185   flowCorpus u n la (map (map toHyperdataDocument) docs)
 186
 187 ------------------------------------------------------------------------
 188 -- | TODO improve the needed type to create/update a corpus
 189 -- (For now, Either is enough)
 190 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
 191            => User
 192            -> Either CorpusName [CorpusId]
 193            -> TermType Lang
 194            -> [[a]]
 195            -> m CorpusId
 196 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
 197
 198
 199 flow :: ( FlowCmdM env err m
 200         , FlowCorpus a
 201         , MkCorpus c
 202         )
 203         => Maybe c
 204         -> User
 205         -> Either CorpusName [CorpusId]
 206         -> TermType Lang
 207         -> [[a]]
 208         -> m CorpusId
 209 flow c u cn la docs = do
 210   -- TODO if public insertMasterDocs else insertUserDocs
 211   ids <- traverse (insertMasterDocs c la) docs
 212   flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
 213
 214 ------------------------------------------------------------------------
 215 flowCorpusUser :: ( FlowCmdM env err m
 216                   , MkCorpus c
 217                   )
 218                => Lang
 219                -> User
 220                -> Either CorpusName [CorpusId]
 221                -> Maybe c
 222                -> [NodeId]
 223                -> m CorpusId
 224 flowCorpusUser l user corpusName ctype ids = do
 225   -- User Flow
 226   (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
 227   -- NodeTexts is first
 228   _tId <- insertDefaultNode NodeTexts userCorpusId userId
 229   -- printDebug "NodeTexts: " tId
 230
 231   -- NodeList is second
 232   listId <- getOrMkList userCorpusId userId
 233   -- _cooc  <- insertDefaultNode NodeListCooc listId userId
 234   -- TODO: check if present already, ignore
 235   _ <- Doc.add userCorpusId ids
 236
 237   -- printDebug "Node Text Ids:" tId
 238
 239   -- User List Flow
 240   (masterUserId, _masterRootId, masterCorpusId)
 241     <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
 242
 243   --let gp = (GroupParams l 2 3 (StopSize 3))
 244   let gp = GroupWithPosTag l CoreNLP HashMap.empty
 245   ngs         <- buildNgramsLists gp user userCorpusId masterCorpusId
 246
 247   _userListId <- flowList_DbRepo listId ngs
 248   _mastListId <- getOrMkList masterCorpusId masterUserId
 249   -- _ <- insertOccsUpdates userCorpusId mastListId
 250   -- printDebug "userListId" userListId
 251   -- User Graph Flow
 252   _ <- insertDefaultNode NodeDashboard userCorpusId userId
 253   _ <- insertDefaultNode NodeGraph     userCorpusId userId
 254   --_ <- mkPhylo  userCorpusId userId
 255   -- Annuaire Flow
 256   -- _ <- mkAnnuaire  rootUserId userId
 257   pure userCorpusId
 258
 259
 260 insertMasterDocs :: ( FlowCmdM env err m
 261                     , FlowCorpus a
 262                     , MkCorpus   c
 263                     )
 264                  => Maybe c
 265                  -> TermType Lang
 266                  -> [a]
 267                  -> m [DocId]
 268 insertMasterDocs c lang hs  =  do
 269   (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
 270   (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
 271   _ <- Doc.add masterCorpusId ids'
 272   -- TODO
 273   -- create a corpus with database name (CSV or PubMed)
 274   -- add documents to the corpus (create node_node link)
 275   -- this will enable global database monitoring
 276
 277   -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
 278   mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
 279                 <- mapNodeIdNgrams
 280                 <$> documentIdWithNgrams
 281                     (extractNgramsT $ withLang lang documentsWithId)
 282                     documentsWithId
 283
 284   lId      <- getOrMkList masterCorpusId masterUserId
 285   _ <- saveDocNgramsWith lId mapNgramsDocs'
 286
 287   -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
 288   pure ids'
 289
 290 saveDocNgramsWith :: ( FlowCmdM env err m)
 291                   => ListId
 292                   -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
 293                   -> m ()
 294 saveDocNgramsWith lId mapNgramsDocs' = do
 295   terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
 296   let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
 297
 298   -- to be removed
 299   let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
 300
 301   -- new
 302   mapCgramsId <- listInsertDb lId toNodeNgramsW'
 303                $ map (first _ngramsTerms . second Map.keys)
 304                $ HashMap.toList mapNgramsDocs
 305
 306   -- insertDocNgrams
 307   _return <- insertNodeNodeNgrams2
 308            $ catMaybes [ NodeNodeNgrams2 <$> Just nId
 309                                          <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
 310                                          <*> Just (fromIntegral w :: Double)
 311                        | (terms'', mapNgramsTypes)      <- HashMap.toList mapNgramsDocs
 312                        , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
 313                        , (nId, w)                       <- Map.toList mapNodeIdWeight
 314                        ]
 315   -- to be removed
 316   _   <- insertDocNgrams lId indexedNgrams
 317
 318   pure ()
 319
 320
 321 ------------------------------------------------------------------------
 322 -- TODO Type NodeDocumentUnicised
 323 insertDocs :: ( FlowCmdM env err m
 324               -- , FlowCorpus a
 325               , FlowInsertDB a
 326               )
 327               => UserId
 328               -> CorpusId
 329               -> [a]
 330               -> m ([DocId], [Indexed NodeId a])
 331 insertDocs uId cId hs = do
 332   let docs = map addUniqId hs
 333   newIds <- insertDb uId cId docs
 334   -- printDebug "newIds" newIds
 335   let
 336     newIds' = map reId newIds
 337     documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
 338   _ <- Doc.add cId newIds'
 339   pure (newIds', documentsWithId)
 340
 341
 342 ------------------------------------------------------------------------
 343 viewUniqId' :: UniqId a
 344             => a
 345             -> (Hash, a)
 346 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
 347       where
 348         err = panic "[ERROR] Database.Flow.toInsert"
 349
 350
 351 toInserted :: [ReturnId]
 352            -> Map Hash ReturnId
 353 toInserted =
 354   Map.fromList . map    (\r -> (reUniqId r, r)     )
 355                . filter (\r -> reInserted r == True)
 356
 357 mergeData :: Map Hash ReturnId
 358           -> Map Hash a
 359           -> [Indexed NodeId a]
 360 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
 361   where
 362     toDocumentWithId (sha,hpd) =
 363       Indexed <$> fmap reId (lookup sha rs)
 364               <*> Just hpd
 365
 366 ------------------------------------------------------------------------
 367 ------------------------------------------------------------------------
 368 ------------------------------------------------------------------------
 369 documentIdWithNgrams :: HasNodeError err
 370                      => (a
 371                      -> Cmd err (HashMap b (Map NgramsType Int)))
 372                      -> [Indexed NodeId a]
 373                      -> Cmd err [DocumentIdWithNgrams a b]
 374 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
 375   where
 376     toDocumentIdWithNgrams d = do
 377       e <- f $ _unIndex         d
 378       pure   $ DocumentIdWithNgrams d e
 379
 380
 381 -- | TODO check optimization
 382 mapNodeIdNgrams :: (Ord b, Hashable b)
 383                 => [DocumentIdWithNgrams a b]
 384                 -> HashMap b
 385                        (Map NgramsType
 386                             (Map NodeId Int)
 387                        )
 388 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
 389   where
 390     f :: DocumentIdWithNgrams a b
 391       -> HashMap b (Map NgramsType (Map NodeId Int))
 392     f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
 393       where
 394         nId = _index $ documentWithId d
 395
 396
 397 ------------------------------------------------------------------------
 398 instance ExtractNgramsT HyperdataContact
 399   where
 400     extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
 401       where
 402         extract :: TermType Lang -> HyperdataContact
 403                 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 404         extract _l hc' = do
 405           let authors = map text2ngrams
 406                       $ maybe ["Nothing"] (\a -> [a])
 407                       $ view (hc_who . _Just . cw_lastName) hc'
 408
 409           pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
 410
 411
 412 instance ExtractNgramsT HyperdataDocument
 413   where
 414     extractNgramsT :: TermType Lang
 415                    -> HyperdataDocument
 416                    -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 417     extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
 418       where
 419         extractNgramsT' :: TermType Lang
 420                         -> HyperdataDocument
 421                        -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
 422         extractNgramsT' lang' doc = do
 423           let source    = text2ngrams
 424                         $ maybe "Nothing" identity
 425                         $ _hd_source doc
 426
 427               institutes = map text2ngrams
 428                          $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
 429                          $ _hd_institutes doc
 430
 431               authors    = map text2ngrams
 432                          $ maybe ["Nothing"] (splitOn ", ")
 433                          $ _hd_authors doc
 434
 435           terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
 436                  <$> concat
 437                  <$> liftBase (extractTerms lang' $ hasText doc)
 438
 439           pure $ HashMap.fromList
 440                $  [(SimpleNgrams source, Map.singleton Sources     1)                    ]
 441                <> [(SimpleNgrams     i', Map.singleton Institutes  1) | i' <- institutes ]
 442                <> [(SimpleNgrams     a', Map.singleton Authors     1) | a' <- authors    ]
 443                <> [(EnrichedNgrams   t', Map.singleton NgramsTerms 1) | t' <- terms'     ]
 444
 445 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
 446   where
 447     extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
 448
 449 instance HasText a => HasText (Node a)
 450   where
 451     hasText (Node _ _ _ _ _ _ _ h) = hasText h
 452
 453
 454
 455 -- | TODO putelsewhere
 456 -- | Upgrade function
 457 -- Suppose all documents are English (this is the case actually)
 458 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
 459 indexAllDocumentsWithPosTag = do
 460   rootId    <- getRootId (UserName userMaster)
 461   corpusIds <- findNodesId rootId [NodeCorpus]
 462   docs      <- List.concat <$> mapM getDocumentsWithParentId corpusIds
 463
 464   _ <- mapM extractInsert (splitEvery 1000 docs)
 465
 466   pure ()
 467
 468 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
 469 extractInsert docs = do
 470   let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
 471
 472   mapNgramsDocs' <- mapNodeIdNgrams
 473                 <$> documentIdWithNgrams
 474                     (extractNgramsT $ withLang (Multi EN) documentsWithId)
 475                     documentsWithId
 476
 477   _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
 478
 479   pure ()
 480
 481