]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[Annuaire] types to import users.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
17 where
18
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
26
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard)--, mkAnnuaire)
32 import Gargantext.Database.Types.Node (NodeType(..))
33 import Gargantext.Database.Node.Document.Add (add)
34 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
35 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
36 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
37 import Gargantext.Database.Types.Node (HyperdataDocument(..))
38 import Gargantext.Database.Node.Contact (HyperdataContact(..))
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Ext.IMT (toSchoolName)
41 import Gargantext.Prelude
42 import Gargantext.Text.Parsers (parseDocs, FileFormat)
43
44 type UserId = Int
45 type MasterUserId = Int
46
47 type RootId = Int
48 type CorpusId = Int
49
50
51 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
52 flowDatabase ff fp cName = do
53 -- Corpus Flow
54 hyperdataDocuments <- map addUniqIdsDoc <$> parseDocs ff fp
55 params <- flowInsert NodeCorpus hyperdataDocuments cName
56 flowCorpus NodeCorpus hyperdataDocuments params
57
58
59 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
60 -> IO ([ReturnId], MasterUserId, UserId, CorpusId)
61 flowInsert nt hyperdataDocuments cName = do
62 let hyperdataDocuments' = case nt of
63 NodeCorpus -> map (\h -> ToDbDocument h) hyperdataDocuments
64 -- NodeAnnuaire -> map (\h -> ToDbContact h) hyperdataDocuments
65
66 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
67 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId hyperdataDocuments'
68
69 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
70 _ <- runCmd' $ add userCorpusId (map reId ids)
71
72 pure (ids, masterUserId, userId, userCorpusId)
73
74 --{-
75 flowInsertAnnuaire name children = do
76
77 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
78 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId children
79
80 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary name
81 _ <- runCmd' $ add userCorpusId (map reId ids)
82
83 printDebug "AnnuaireID" userCorpusId
84
85 pure (ids, masterUserId, userId, userCorpusId)
86
87
88 --}
89
90 --{-
91 -- flowCorpus :: NodeType -> [HyperdataDocument] -> ([ReturnId],MasterUserId,UserId,CorpusId) -> IO CorpusId
92 flowCorpus NodeCorpus hyperdataDocuments (ids,masterUserId,userId,userCorpusId) = do
93 --}
94 --------------------------------------------------
95 -- List Ngrams Flow
96 userListId <- runCmd' $ listFlowUser userId userCorpusId
97 printDebug "Working on User ListId : " userListId
98
99 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
100 -- printDebug "documentsWithId" documentsWithId
101 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
102 -- printDebug "docsWithNgrams" docsWithNgrams
103 let maps = mapNodeIdNgrams docsWithNgrams
104
105 -- printDebug "maps" (maps)
106 indexedNgrams <- runCmd' $ indexNgrams maps
107 -- printDebug "inserted ngrams" indexedNgrams
108 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
109
110 listId2 <- runCmd' $ listFlow masterUserId userCorpusId indexedNgrams
111 printDebug "Working on ListId : " listId2
112 --}
113
114 --------------------------------------------------
115 _ <- runCmd' $ mkDashboard userCorpusId userId
116 _ <- runCmd' $ mkGraph userCorpusId userId
117
118 -- Annuaire Flow
119 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
120
121 pure userCorpusId
122 -- runCmd' $ del [corpusId2, corpusId]
123
124 flowCorpus NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_userId,_userCorpusId) = undefined
125 flowCorpus _ _ _ = undefined
126
127
128 type CorpusName = Text
129
130 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
131 subFlowCorpus username cName = do
132 maybeUserId <- runCmd' (getUser username)
133
134 let userId = case maybeUserId of
135 Nothing -> panic "Error: User does not exist (yet)"
136 -- mk NodeUser gargantua_id "Node Gargantua"
137 Just user -> userLight_id user
138
139 rootId' <- map _node_id <$> runCmd' (getRoot userId)
140
141 rootId'' <- case rootId' of
142 [] -> runCmd' (mkRoot username userId)
143 n -> case length n >= 2 of
144 True -> panic "Error: more than 1 userNode / user"
145 False -> pure rootId'
146 let rootId = maybe (panic "error rootId") identity (head rootId'')
147
148 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
149 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
150
151 printDebug "(username, userId, rootId, corpusId)"
152 (username, userId, rootId, corpusId)
153 pure (userId, rootId, corpusId)
154
155 ------------------------------------------------------------------------
156
157 type HashId = Text
158 type NodeId = Int
159 type ListId = Int
160
161 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
162 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
163 where
164 err = "Database.Flow.toInsert"
165
166 toInserted :: [ReturnId] -> Map HashId ReturnId
167 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
168 . filter (\r -> reInserted r == True)
169
170 data DocumentWithId =
171 DocumentWithId { documentId :: NodeId
172 , documentData :: HyperdataDocument
173 } deriving (Show)
174
175 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
176 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
177 where
178 toDocumentWithId (hash,hpd) =
179 DocumentWithId <$> fmap reId (lookup hash rs)
180 <*> Just hpd
181
182 ------------------------------------------------------------------------
183
184 data DocumentIdWithNgrams =
185 DocumentIdWithNgrams
186 { documentWithId :: DocumentWithId
187 , document_ngrams :: Map (NgramsT Ngrams) Int
188 } deriving (Show)
189
190 -- TODO add Terms (Title + Abstract)
191 -- add f :: Text -> Text
192 -- newtype Ngrams = Ngrams Text
193 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
194 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
195 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
196 <> [(NgramsT Authors a' , 1)| a' <- authors ]
197 where
198 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
199 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
200 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
201 -- TODO group terms
202
203
204
205
206 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
207 -> [DocumentWithId] -> [DocumentIdWithNgrams]
208 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
209
210 -- | TODO check optimization
211 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
212 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
213 where
214 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
215 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
216
217 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
218 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
219 indexNgrams ng2nId = do
220 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
221 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
222
223
224 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
225 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
226 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
227 | (ng, nId2int) <- DM.toList m
228 , (nId, n) <- DM.toList nId2int
229 ]
230
231
232 ------------------------------------------------------------------------
233 ------------------------------------------------------------------------
234 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
235 listFlow uId cId ngs = do
236 -- printDebug "ngs:" ngs
237 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
238 --printDebug "ngs" (DM.keys ngs)
239 -- TODO add stemming equivalence of 2 ngrams
240 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
241 _ <- insertGroups lId groupEd
242
243 -- compute Candidate / Map
244 let lists = ngrams2list ngs
245 -- printDebug "lists:" lists
246
247 is <- insertLists lId lists
248 printDebug "listNgrams inserted :" is
249
250 pure lId
251
252 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
253 listFlowUser uId cId = mkList cId uId
254
255 ------------------------------------------------------------------------
256
257 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
258 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
259 -> Map NgramsIndexed NgramsIndexed
260 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
261
262
263
264 -- TODO check: do not insert duplicates
265 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
266 insertGroups lId ngrs =
267 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
268 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
269 , ng1 /= ng2
270 ]
271
272 ------------------------------------------------------------------------
273 -- TODO: verify NgramsT lost here
274 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
275 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
276
277 -- | TODO: weight of the list could be a probability
278 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
279 insertLists lId lngs =
280 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
281 | (l,ngr) <- map (second _ngramsId) lngs
282 ]
283
284 ------------------------------------------------------------------------
285 ------------------------------------------------------------------------
286