2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
19 --import Control.Lens (view)
20 --import Gargantext.Core.Types
21 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
22 import Data.Map (Map, lookup)
23 import Data.Maybe (Maybe(..), catMaybes)
24 import Data.Text (Text, splitOn, intercalate)
25 import Data.Tuple.Extra (both, second)
26 import Data.List (concat)
27 import GHC.Show (Show)
28 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
29 import Gargantext.Core.Types.Individu (Username)
30 import Gargantext.Core.Types.Main
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Database.Bashql (runCmd') -- , del)
33 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
34 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
35 import Gargantext.Text.Terms (extractTerms)
36 import Gargantext.Database.Node.Document.Add (add)
37 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
38 import Gargantext.Database.Root (getRootCmd)
39 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
40 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId')
41 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
42 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
43 import Gargantext.Database.Schema.User (getUser, UserLight(..))
44 import Gargantext.Database.Types.Node (HyperdataDocument(..))
45 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
46 import Gargantext.Database.Utils (Cmd(..))
47 import Gargantext.Text.Terms (TermType(..))
48 import Gargantext.Ext.IMT (toSchoolName)
49 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
50 import Gargantext.Prelude
51 import Gargantext.Text.Parsers (parseDocs, FileFormat)
52 import System.FilePath (FilePath)
53 import qualified Data.Map as DM
55 flowCorpus :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
56 flowCorpus ff fp cName = do
57 hyperdataDocuments' <- map addUniqIdsDoc <$> parseDocs ff fp
58 params <- flowInsert NodeCorpus hyperdataDocuments' cName
59 flowCorpus' NodeCorpus hyperdataDocuments' params
62 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
63 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
64 flowInsert _nt hyperdataDocuments cName = do
65 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
67 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
68 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
70 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
71 _ <- runCmd' $ add userCorpusId (map reId ids)
73 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
76 flowAnnuaire :: FilePath -> IO ()
77 flowAnnuaire filePath = do
78 contacts <- deserialiseImtUsersFromFile filePath
79 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
80 printDebug "length annuaire" ps
83 flowInsertAnnuaire :: CorpusName -> [ToDbData]
84 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
85 flowInsertAnnuaire name children = do
87 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeContact children
90 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
91 _ <- runCmd' $ add userCorpusId (map reId ids)
93 printDebug "AnnuaireID" userCorpusId
95 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
98 flowCorpus' :: NodeType -> [HyperdataDocument]
99 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
101 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
102 --------------------------------------------------
104 userListId <- runCmd' $ flowListUser userId userCorpusId
105 printDebug "Working on User ListId : " userListId
107 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
108 -- printDebug "documentsWithId" documentsWithId
109 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
110 -- printDebug "docsWithNgrams" docsWithNgrams
111 let maps = mapNodeIdNgrams docsWithNgrams
113 -- printDebug "maps" (maps)
114 indexedNgrams <- runCmd' $ indexNgrams maps
115 -- printDebug "inserted ngrams" indexedNgrams
116 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
118 listId2 <- runCmd' $ flowList masterUserId masterCorpusId indexedNgrams
119 printDebug "Working on ListId : " listId2
121 --------------------------------------------------
122 _ <- runCmd' $ mkDashboard userCorpusId userId
123 _ <- runCmd' $ mkGraph userCorpusId userId
126 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
129 -- runCmd' $ del [corpusId2, corpusId]
131 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
132 flowCorpus' _ _ _ = undefined
135 type CorpusName = Text
137 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
138 subFlowCorpus username cName = do
139 maybeUserId <- runCmd' (getUser username)
141 let userId = case maybeUserId of
142 Nothing -> panic "Error: User does not exist (yet)"
143 -- mk NodeUser gargantua_id "Node Gargantua"
144 Just user -> userLight_id user
146 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
148 rootId'' <- case rootId' of
149 [] -> runCmd' (mkRoot username userId)
150 n -> case length n >= 2 of
151 True -> panic "Error: more than 1 userNode / user"
152 False -> pure rootId'
153 let rootId = maybe (panic "error rootId") identity (head rootId'')
155 corpusId'' <- if username == userMaster
157 ns <- runCmd' $ getCorporaWithParentId' rootId
158 pure $ map _node_id ns
163 corpusId' <- if corpusId'' /= []
165 else runCmd' $ mkCorpus (Just cName) Nothing rootId userId
167 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
169 printDebug "(username, userId, rootId, corpusId)"
170 (username, userId, rootId, corpusId)
171 pure (userId, rootId, corpusId)
174 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
175 subFlowAnnuaire username _cName = do
176 maybeUserId <- runCmd' (getUser username)
178 let userId = case maybeUserId of
179 Nothing -> panic "Error: User does not exist (yet)"
180 -- mk NodeUser gargantua_id "Node Gargantua"
181 Just user -> userLight_id user
183 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
185 rootId'' <- case rootId' of
186 [] -> runCmd' (mkRoot username userId)
187 n -> case length n >= 2 of
188 True -> panic "Error: more than 1 userNode / user"
189 False -> pure rootId'
190 let rootId = maybe (panic "error rootId") identity (head rootId'')
192 corpusId' <- runCmd' $ mkAnnuaire rootId userId
194 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
196 printDebug "(username, userId, rootId, corpusId)"
197 (username, userId, rootId, corpusId)
198 pure (userId, rootId, corpusId)
202 ------------------------------------------------------------------------
203 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
204 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
206 err = "Database.Flow.toInsert"
208 toInserted :: [ReturnId] -> Map HashId ReturnId
209 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
210 . filter (\r -> reInserted r == True)
212 data DocumentWithId =
213 DocumentWithId { documentId :: NodeId
214 , documentData :: HyperdataDocument
217 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
218 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
220 toDocumentWithId (hash,hpd) =
221 DocumentWithId <$> fmap reId (lookup hash rs)
224 ------------------------------------------------------------------------
226 data DocumentIdWithNgrams =
228 { documentWithId :: DocumentWithId
229 , document_ngrams :: Map (NgramsT Ngrams) Int
232 -- TODO add Terms (Title + Abstract)
233 -- add f :: Text -> Text
234 -- newtype Ngrams = Ngrams Text
236 extractNgramsT :: HyperdataDocument -> IO (Map (NgramsT Ngrams) Int)
237 extractNgramsT doc = do
239 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
240 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
241 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
242 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
243 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> extractTerms (Multi EN) leText
245 pure $ DM.fromList $ [(NgramsT Sources source, 1)]
246 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
247 <> [(NgramsT Authors a' , 1)| a' <- authors ]
248 <> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
253 documentIdWithNgrams :: (HyperdataDocument -> IO (Map (NgramsT Ngrams) Int))
254 -> [DocumentWithId] -> IO [DocumentIdWithNgrams]
255 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
257 toDocumentIdWithNgrams d = do
258 e <- f $ documentData d
259 pure $ DocumentIdWithNgrams d e
261 -- | TODO check optimization
262 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
263 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
265 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
266 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
268 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
269 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
270 indexNgrams ng2nId = do
271 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
272 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
275 ------------------------------------------------------------------------
276 ------------------------------------------------------------------------
277 flowList :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
278 flowList uId cId ngs = do
279 -- printDebug "ngs:" ngs
280 lId <- getOrMkList cId uId
281 --printDebug "ngs" (DM.keys ngs)
282 -- TODO add stemming equivalence of 2 ngrams
283 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
284 _ <- insertGroups lId groupEd
286 -- compute Candidate / Map
287 let lists = ngrams2list ngs
288 -- printDebug "lists:" lists
290 is <- insertLists lId lists
291 printDebug "listNgrams inserted :" is
295 flowListUser :: UserId -> CorpusId -> Cmd Int
296 flowListUser uId cId = getOrMkList cId uId
298 ------------------------------------------------------------------------
300 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
301 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
302 -> Map NgramsIndexed NgramsIndexed
303 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
307 -- TODO check: do not insert duplicates
308 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
309 insertGroups lId ngrs =
310 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
311 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
315 ------------------------------------------------------------------------
316 -- TODO: verify NgramsT lost here
317 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
318 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
320 -- | TODO: weight of the list could be a probability
321 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
322 insertLists lId lngs =
323 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
324 | (l,ngr) <- map (second _ngramsId) lngs
327 ------------------------------------------------------------------------
328 ------------------------------------------------------------------------