src/Gargantext/Database/Flow.hs

   1 {-|
   2 Module      : Gargantext.Database.Flow
   3 Description : Database Flow
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10
  11 -}
  12
  13 {-# LANGUAGE DeriveGeneric     #-}
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16
  17 module Gargantext.Database.Flow (flowDatabase, ngrams2list)
  18     where
  19
  20 import GHC.Show (Show)
  21 import System.FilePath (FilePath)
  22 import Data.Maybe (Maybe(..), catMaybes)
  23 import Data.Text (Text, splitOn)
  24 import Data.Map (Map, lookup)
  25 import Data.Tuple.Extra (both, second)
  26 import qualified Data.Map as DM
  27
  28 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
  29 import Gargantext.Database.Bashql (runCmd') -- , del)
  30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
  31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard, mkAnnuaire)
  32 import Gargantext.Database.Node.Document.Add    (add)
  33 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
  34 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
  35 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
  36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  37 import Gargantext.Database.User (getUser, UserLight(..), Username)
  38 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
  39 import Gargantext.Prelude
  40 import Gargantext.Text.Parsers (parseDocs, FileFormat)
  41 import Gargantext.Ext.IMT (toSchoolName)
  42
  43 type UserId = Int
  44 type RootId = Int
  45 type CorpusId = Int
  46
  47 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO Int
  48 flowDatabase ff fp cName = do
  49
  50   -- Corus Flow
  51   (masterUserId, _, corpusId) <- subFlowCorpus userMaster corpusMasterName
  52
  53   -- Documents Flow
  54   hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
  55
  56   printDebug "hyperdataDocuments" hyperdataDocuments
  57
  58   ids  <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
  59   --printDebug "Docs IDs : " (ids)
  60   idsRepeat  <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
  61   printDebug "Repeated Docs IDs : " (length idsRepeat)
  62
  63   -- Ngrams Flow
  64   -- todo: flow for new documents only
  65   let tids = toInserted ids
  66   printDebug "toInserted ids" (length tids, tids)
  67
  68   let tihs = toInsert hyperdataDocuments
  69   printDebug "toInsert hyperdataDocuments" (length tihs, tihs)
  70
  71   let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
  72   printDebug "documentsWithId" documentsWithId
  73
  74   -- docsWithNgrams <- documentIdWithNgrams documentsWithId extractNgramsT
  75   let docsWithNgrams  = documentIdWithNgrams extractNgramsT documentsWithId
  76   printDebug "docsWithNgrams" docsWithNgrams
  77
  78   let maps            = mapNodeIdNgrams docsWithNgrams
  79   printDebug "maps" (maps)
  80
  81   indexedNgrams <- runCmd' $ indexNgrams maps
  82   printDebug "inserted ngrams" indexedNgrams
  83   _             <- runCmd' $ insertToNodeNgrams indexedNgrams
  84
  85   -- List Flow
  86   listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
  87   printDebug "list id : " listId2
  88
  89   (userId, rootUserId, corpusId2) <- subFlowCorpus userArbitrary cName
  90
  91   userListId <- runCmd' $ listFlowUser userId corpusId2
  92   printDebug "UserList : " userListId
  93
  94   inserted <- runCmd' $ add corpusId2 (map reId ids)
  95   printDebug "Inserted : " (length inserted)
  96
  97   _ <- runCmd' $ mkDashboard corpusId2 userId
  98   _ <- runCmd' $ mkGraph     corpusId2 userId
  99
 100   -- Annuaire Flow
 101   annuaireId <- runCmd' $ mkAnnuaire  rootUserId userId
 102
 103   pure corpusId2
 104   -- runCmd' $ del [corpusId2, corpusId]
 105
 106 type CorpusName = Text
 107
 108 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
 109 subFlowCorpus username cName = do
 110   maybeUserId <- runCmd' (getUser username)
 111
 112   let userId = case maybeUserId of
 113         Nothing   -> panic "Error: User does not exist (yet)"
 114         -- mk NodeUser gargantua_id "Node Gargantua"
 115         Just user -> userLight_id user
 116
 117   rootId' <- map _node_id <$> runCmd' (getRoot userId)
 118
 119   rootId'' <- case rootId' of
 120         []  -> runCmd' (mkRoot username userId)
 121         n   -> case length n >= 2 of
 122             True  -> panic "Error: more than 1 userNode / user"
 123             False -> pure rootId'
 124   let rootId = maybe (panic "error rootId") identity (head rootId'')
 125
 126   corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
 127   let corpusId = maybe (panic "error corpusId") identity (head corpusId')
 128
 129   printDebug "(username, userId, rootId, corpusId)"
 130               (username, userId, rootId, corpusId)
 131   pure (userId, rootId, corpusId)
 132
 133 ------------------------------------------------------------------------
 134
 135 type HashId   = Text
 136 type NodeId   = Int
 137 type ListId   = Int
 138
 139 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
 140 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
 141   where
 142     hash = maybe "Error" identity
 143
 144 toInserted :: [ReturnId] -> Map HashId ReturnId
 145 toInserted rs = DM.fromList $ map    (\r ->  (reUniqId r, r)    )
 146                             $ filter (\r -> reInserted r == True) rs
 147
 148 data DocumentWithId =
 149      DocumentWithId { documentId   :: NodeId
 150                     , documentData :: HyperdataDocument
 151                     } deriving (Show)
 152
 153 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
 154 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
 155   where
 156     toDocumentWithId (hash,hpd) =
 157       DocumentWithId <$> fmap reId (lookup hash rs)
 158                      <*> Just hpd
 159
 160 ------------------------------------------------------------------------
 161
 162 data DocumentIdWithNgrams =
 163      DocumentIdWithNgrams
 164      { documentWithId  :: DocumentWithId
 165      , document_ngrams :: Map (NgramsT Ngrams) Int
 166      } deriving (Show)
 167
 168 -- TODO add Terms (Title + Abstract)
 169 -- add f :: Text -> Text
 170 -- newtype Ngrams = Ngrams Text
 171 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
 172 extractNgramsT doc = DM.fromList $  [(NgramsT Sources source, 1)]
 173                                  <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
 174                                  <> [(NgramsT Authors    a' , 1)| a' <- authors    ]
 175   where
 176     source    = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
 177     institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))  $ _hyperdataDocument_institutes doc
 178     authors    = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
 179     -- TODO group terms
 180
 181 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
 182                      -> [DocumentWithId]   -> [DocumentIdWithNgrams]
 183 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
 184
 185 -- | TODO check optimization
 186 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
 187 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
 188   where
 189     xs  = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
 190     n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
 191
 192 indexNgrams :: Map (NgramsT Ngrams       ) (Map NodeId Int)
 193        -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
 194 indexNgrams ng2nId = do
 195   terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
 196   pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
 197
 198
 199 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
 200 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId  ((_ngramsId    . _ngramsT   ) ng)
 201                                                 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
 202                                           | (ng, nId2int) <- DM.toList m
 203                                           , (nId, n)      <- DM.toList nId2int
 204                                         ]
 205
 206
 207 ------------------------------------------------------------------------
 208 ------------------------------------------------------------------------
 209 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
 210 listFlow uId cId ngs = do
 211   printDebug "ngs:" ngs
 212   lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
 213   printDebug "ngs" (DM.keys ngs)
 214   -- TODO add stemming equivalence of 2 ngrams
 215   let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
 216   _ <- insertGroups lId groupEd
 217
 218 -- compute Candidate / Map
 219   let lists = ngrams2list ngs
 220   printDebug "lists:" lists
 221
 222   is <- insertLists lId lists
 223   printDebug "listNgrams inserted :" is
 224
 225   pure lId
 226
 227 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
 228 listFlowUser uId cId = mkList cId uId
 229
 230 ------------------------------------------------------------------------
 231
 232 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
 233               -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
 234               -> Map NgramsIndexed NgramsIndexed
 235 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
 236
 237
 238
 239 -- TODO check: do not insert duplicates
 240 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
 241 insertGroups lId ngrs =
 242   insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
 243                               | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
 244                               , ng1 /= ng2
 245                             ]
 246
 247 ------------------------------------------------------------------------
 248 -- TODO: verify NgramsT lost here
 249 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
 250 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
 251
 252 -- | TODO: weight of the list could be a probability
 253 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
 254 insertLists lId lngs =
 255   insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
 256                      | (l,ngr) <- map (second _ngramsId) lngs
 257                    ]
 258
 259 ------------------------------------------------------------------------
 260 ------------------------------------------------------------------------
 261