]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Merge branch 'master' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantext
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow (flowDatabase, ngrams2list)
17 where
18
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
26
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard)--, mkAnnuaire)
32 import Gargantext.Database.Node.Document.Add (add)
33 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, ToDbData(..))
34 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
35 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 -- import Gargantext.Database.Node.Contact (HyperdataContact(..))
38 import Gargantext.Database.User (getUser, UserLight(..), Username)
39 import Gargantext.Ext.IMT (toSchoolName)
40 import Gargantext.Prelude
41 import Gargantext.Text.Parsers (parseDocs, FileFormat)
42
43 type UserId = Int
44 type RootId = Int
45 type CorpusId = Int
46
47 {-
48 flowCorpus :: [ToDbData] -> CorpusName -> IO CorpusId
49 flowCorpus = undefined
50 --}
51
52 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO Int
53 flowDatabase ff fp cName = do
54 -- Corpus Flow
55 (masterUserId, _, corpusId) <- subFlowCorpus userMaster corpusMasterName
56
57 -- Documents Flow
58 hyperdataDocuments <- map addUniqIdsDoc <$> parseDocs ff fp
59 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
60 printDebug "hyperdataDocuments" (length hyperdataDocuments)
61
62
63
64
65 ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
66 -- printDebug "Docs IDs : " (ids)
67 -- idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
68 -- printDebug "Repeated Docs IDs : " (length idsRepeat)
69 let idsNotRepeated = filter (\r -> reInserted r == True) ids
70 --{-
71 -- Ngrams Flow
72 -- todo: flow for new documents only
73 let tids = toInserted ids
74 printDebug "toInserted ids" (length tids)
75
76 let tihs = toInsert hyperdataDocuments
77 printDebug "toInsert hyperdataDocuments" (length tihs)
78
79 let documentsWithId = mergeData (toInserted idsNotRepeated) (toInsert hyperdataDocuments)
80 -- printDebug "documentsWithId" documentsWithId
81
82 -- docsWithNgrams <- documentIdWithNgrams documentsWithId extractNgramsT
83 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
84 -- printDebug "docsWithNgrams" docsWithNgrams
85
86 let maps = mapNodeIdNgrams docsWithNgrams
87 -- printDebug "maps" (maps)
88
89 indexedNgrams <- runCmd' $ indexNgrams maps
90 -- printDebug "inserted ngrams" indexedNgrams
91 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
92
93 -- List Flow
94 listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
95 printDebug "list id : " listId2
96
97 --}
98 (userId, _, corpusId2) <- subFlowCorpus userArbitrary cName
99
100 userListId <- runCmd' $ listFlowUser userId corpusId2
101 printDebug "UserList : " userListId
102 inserted <- runCmd' $ add corpusId2 (map reId ids)
103 printDebug "Added : " (length inserted)
104
105 _ <- runCmd' $ mkDashboard corpusId2 userId
106 _ <- runCmd' $ mkGraph corpusId2 userId
107
108 -- Annuaire Flow
109 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
110
111 pure corpusId2
112 -- runCmd' $ del [corpusId2, corpusId]
113
114 type CorpusName = Text
115
116 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
117 subFlowCorpus username cName = do
118 maybeUserId <- runCmd' (getUser username)
119
120 let userId = case maybeUserId of
121 Nothing -> panic "Error: User does not exist (yet)"
122 -- mk NodeUser gargantua_id "Node Gargantua"
123 Just user -> userLight_id user
124
125 rootId' <- map _node_id <$> runCmd' (getRoot userId)
126
127 rootId'' <- case rootId' of
128 [] -> runCmd' (mkRoot username userId)
129 n -> case length n >= 2 of
130 True -> panic "Error: more than 1 userNode / user"
131 False -> pure rootId'
132 let rootId = maybe (panic "error rootId") identity (head rootId'')
133
134 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
135 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
136
137 printDebug "(username, userId, rootId, corpusId)"
138 (username, userId, rootId, corpusId)
139 pure (userId, rootId, corpusId)
140
141 ------------------------------------------------------------------------
142
143 type HashId = Text
144 type NodeId = Int
145 type ListId = Int
146
147 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
148 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
149 where
150 hash = maybe "Error" identity
151
152 toInserted :: [ReturnId] -> Map HashId ReturnId
153 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
154 $ filter (\r -> reInserted r == True) rs
155
156 data DocumentWithId =
157 DocumentWithId { documentId :: NodeId
158 , documentData :: HyperdataDocument
159 } deriving (Show)
160
161 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
162 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
163 where
164 toDocumentWithId (hash,hpd) =
165 DocumentWithId <$> fmap reId (lookup hash rs)
166 <*> Just hpd
167
168 ------------------------------------------------------------------------
169
170 data DocumentIdWithNgrams =
171 DocumentIdWithNgrams
172 { documentWithId :: DocumentWithId
173 , document_ngrams :: Map (NgramsT Ngrams) Int
174 } deriving (Show)
175
176 -- TODO add Terms (Title + Abstract)
177 -- add f :: Text -> Text
178 -- newtype Ngrams = Ngrams Text
179 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
180 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
181 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
182 <> [(NgramsT Authors a' , 1)| a' <- authors ]
183 where
184 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
185 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
186 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
187 -- TODO group terms
188
189 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
190 -> [DocumentWithId] -> [DocumentIdWithNgrams]
191 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
192
193 -- | TODO check optimization
194 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
195 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
196 where
197 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
198 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
199
200 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
201 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
202 indexNgrams ng2nId = do
203 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
204 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
205
206
207 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
208 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
209 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
210 | (ng, nId2int) <- DM.toList m
211 , (nId, n) <- DM.toList nId2int
212 ]
213
214
215 ------------------------------------------------------------------------
216 ------------------------------------------------------------------------
217 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
218 listFlow uId cId ngs = do
219 -- printDebug "ngs:" ngs
220 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
221 --printDebug "ngs" (DM.keys ngs)
222 -- TODO add stemming equivalence of 2 ngrams
223 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
224 _ <- insertGroups lId groupEd
225
226 -- compute Candidate / Map
227 let lists = ngrams2list ngs
228 -- printDebug "lists:" lists
229
230 is <- insertLists lId lists
231 printDebug "listNgrams inserted :" is
232
233 pure lId
234
235 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
236 listFlowUser uId cId = mkList cId uId
237
238 ------------------------------------------------------------------------
239
240 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
241 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
242 -> Map NgramsIndexed NgramsIndexed
243 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
244
245
246
247 -- TODO check: do not insert duplicates
248 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
249 insertGroups lId ngrs =
250 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
251 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
252 , ng1 /= ng2
253 ]
254
255 ------------------------------------------------------------------------
256 -- TODO: verify NgramsT lost here
257 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
258 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
259
260 -- | TODO: weight of the list could be a probability
261 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
262 insertLists lId lngs =
263 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
264 | (l,ngr) <- map (second _ngramsId) lngs
265 ]
266
267 ------------------------------------------------------------------------
268 ------------------------------------------------------------------------
269