]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[Database][Schema][Ngrams]
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
17 where
18
19 import GHC.Show (Show)
20 --import Control.Lens (view)
21 import System.FilePath (FilePath)
22 import Data.Maybe (Maybe(..), catMaybes)
23 import Data.Text (Text, splitOn)
24 import Data.Map (Map, lookup)
25 import Data.Tuple.Extra (both, second)
26 import qualified Data.Map as DM
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId')
32 import Gargantext.Database.Root (getRootCmd)
33 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
34 import Gargantext.Database.Node.Document.Add (add)
35 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
36 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
37 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
38 import Gargantext.Database.Types.Node (HyperdataDocument(..))
39 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
40 import Gargantext.Database.User (getUser, UserLight(..))
41 import Gargantext.Core.Types.Individu (Username)
42 import Gargantext.Ext.IMT (toSchoolName)
43 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
44 import Gargantext.Prelude
45 import Gargantext.Text.Parsers (parseDocs, FileFormat)
46 import Gargantext.Core.Types.Main
47 --import Gargantext.Core.Types
48 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
49
50 flowCorpus :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
51 flowCorpus ff fp cName = do
52 hyperdataDocuments' <- map addUniqIdsDoc <$> parseDocs ff fp
53 params <- flowInsert NodeCorpus hyperdataDocuments' cName
54 flowCorpus' NodeCorpus hyperdataDocuments' params
55
56
57 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
58 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
59 flowInsert _nt hyperdataDocuments cName = do
60 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
61
62 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
63 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
64
65 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
66 _ <- runCmd' $ add userCorpusId (map reId ids)
67
68 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
69
70
71 flowAnnuaire :: FilePath -> IO ()
72 flowAnnuaire filePath = do
73 contacts <- deserialiseImtUsersFromFile filePath
74 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
75 printDebug "length annuaire" ps
76
77
78 flowInsertAnnuaire :: CorpusName -> [ToDbData]
79 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
80 flowInsertAnnuaire name children = do
81
82 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
83 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeContact children
84
85 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
86 _ <- runCmd' $ add userCorpusId (map reId ids)
87
88 printDebug "AnnuaireID" userCorpusId
89
90 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
91
92
93 flowCorpus' :: NodeType -> [HyperdataDocument]
94 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
95 -> IO CorpusId
96 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
97 --------------------------------------------------
98 -- List Ngrams Flow
99 userListId <- runCmd' $ flowListUser userId userCorpusId
100 printDebug "Working on User ListId : " userListId
101
102 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
103 -- printDebug "documentsWithId" documentsWithId
104 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
105 -- printDebug "docsWithNgrams" docsWithNgrams
106 let maps = mapNodeIdNgrams docsWithNgrams
107
108 -- printDebug "maps" (maps)
109 indexedNgrams <- runCmd' $ indexNgrams maps
110 -- printDebug "inserted ngrams" indexedNgrams
111 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
112
113 listId2 <- runCmd' $ flowList masterUserId masterCorpusId indexedNgrams
114 printDebug "Working on ListId : " listId2
115 --}
116 --------------------------------------------------
117 _ <- runCmd' $ mkDashboard userCorpusId userId
118 _ <- runCmd' $ mkGraph userCorpusId userId
119
120 -- Annuaire Flow
121 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
122
123 pure userCorpusId
124 -- runCmd' $ del [corpusId2, corpusId]
125
126 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
127 flowCorpus' _ _ _ = undefined
128
129
130 type CorpusName = Text
131
132 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
133 subFlowCorpus username cName = do
134 maybeUserId <- runCmd' (getUser username)
135
136 let userId = case maybeUserId of
137 Nothing -> panic "Error: User does not exist (yet)"
138 -- mk NodeUser gargantua_id "Node Gargantua"
139 Just user -> userLight_id user
140
141 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
142
143 rootId'' <- case rootId' of
144 [] -> runCmd' (mkRoot username userId)
145 n -> case length n >= 2 of
146 True -> panic "Error: more than 1 userNode / user"
147 False -> pure rootId'
148 let rootId = maybe (panic "error rootId") identity (head rootId'')
149 --{-
150 corpusId'' <- if username == userMaster
151 then do
152 ns <- runCmd' $ getCorporaWithParentId' rootId
153 pure $ map _node_id ns
154 else
155 pure []
156
157 --}
158 corpusId' <- if corpusId'' /= []
159 then pure corpusId''
160 else runCmd' $ mkCorpus (Just cName) Nothing rootId userId
161
162 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
163
164 printDebug "(username, userId, rootId, corpusId)"
165 (username, userId, rootId, corpusId)
166 pure (userId, rootId, corpusId)
167
168
169 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
170 subFlowAnnuaire username _cName = do
171 maybeUserId <- runCmd' (getUser username)
172
173 let userId = case maybeUserId of
174 Nothing -> panic "Error: User does not exist (yet)"
175 -- mk NodeUser gargantua_id "Node Gargantua"
176 Just user -> userLight_id user
177
178 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
179
180 rootId'' <- case rootId' of
181 [] -> runCmd' (mkRoot username userId)
182 n -> case length n >= 2 of
183 True -> panic "Error: more than 1 userNode / user"
184 False -> pure rootId'
185 let rootId = maybe (panic "error rootId") identity (head rootId'')
186
187 corpusId' <- runCmd' $ mkAnnuaire rootId userId
188
189 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
190
191 printDebug "(username, userId, rootId, corpusId)"
192 (username, userId, rootId, corpusId)
193 pure (userId, rootId, corpusId)
194
195
196
197 ------------------------------------------------------------------------
198 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
199 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
200 where
201 err = "Database.Flow.toInsert"
202
203 toInserted :: [ReturnId] -> Map HashId ReturnId
204 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
205 . filter (\r -> reInserted r == True)
206
207 data DocumentWithId =
208 DocumentWithId { documentId :: NodeId
209 , documentData :: HyperdataDocument
210 } deriving (Show)
211
212 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
213 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
214 where
215 toDocumentWithId (hash,hpd) =
216 DocumentWithId <$> fmap reId (lookup hash rs)
217 <*> Just hpd
218
219 ------------------------------------------------------------------------
220
221 data DocumentIdWithNgrams =
222 DocumentIdWithNgrams
223 { documentWithId :: DocumentWithId
224 , document_ngrams :: Map (NgramsT Ngrams) Int
225 } deriving (Show)
226
227 -- TODO add Terms (Title + Abstract)
228 -- add f :: Text -> Text
229 -- newtype Ngrams = Ngrams Text
230 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
231 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
232 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
233 <> [(NgramsT Authors a' , 1)| a' <- authors ]
234 where
235 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
236 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
237 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
238 -- TODO group terms
239
240
241
242
243 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
244 -> [DocumentWithId] -> [DocumentIdWithNgrams]
245 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
246
247 -- | TODO check optimization
248 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
249 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
250 where
251 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
252 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
253
254 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
255 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
256 indexNgrams ng2nId = do
257 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
258 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
259
260
261 ------------------------------------------------------------------------
262 ------------------------------------------------------------------------
263 flowList :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
264 flowList uId cId ngs = do
265 -- printDebug "ngs:" ngs
266 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
267 --printDebug "ngs" (DM.keys ngs)
268 -- TODO add stemming equivalence of 2 ngrams
269 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
270 _ <- insertGroups lId groupEd
271
272 -- compute Candidate / Map
273 let lists = ngrams2list ngs
274 -- printDebug "lists:" lists
275
276 is <- insertLists lId lists
277 printDebug "listNgrams inserted :" is
278
279 pure lId
280
281 flowListUser :: UserId -> CorpusId -> Cmd [Int]
282 flowListUser uId cId = mkList cId uId
283
284 ------------------------------------------------------------------------
285
286 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
287 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
288 -> Map NgramsIndexed NgramsIndexed
289 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
290
291
292
293 -- TODO check: do not insert duplicates
294 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
295 insertGroups lId ngrs =
296 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
297 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
298 , ng1 /= ng2
299 ]
300
301 ------------------------------------------------------------------------
302 -- TODO: verify NgramsT lost here
303 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
304 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
305
306 -- | TODO: weight of the list could be a probability
307 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
308 insertLists lId lngs =
309 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
310 | (l,ngr) <- map (second _ngramsId) lngs
311 ]
312
313 ------------------------------------------------------------------------
314 ------------------------------------------------------------------------
315