1 {-# OPTIONS_GHC -fno-warn-unused-top-binds #-}
3 Module : Gargantext.API.Ngrams
4 Description : Server API
5 Copyright : (c) CNRS, 2017-Present
6 License : AGPL + CECILL v3
7 Maintainer : team@gargantext.org
8 Stability : experimental
14 get ngrams filtered by NgramsType
19 {-# LANGUAGE ConstraintKinds #-}
20 {-# LANGUAGE ScopedTypeVariables #-}
21 {-# LANGUAGE TemplateHaskell #-}
22 {-# LANGUAGE TypeOperators #-}
23 {-# LANGUAGE TypeFamilies #-}
25 module Gargantext.API.Ngrams
32 --, rmListNgrams TODO fix before exporting
33 , apiNgramsTableCorpus
56 , NgramsRepoElement(..)
82 , listNgramsChangedSince
86 import Control.Concurrent
87 import Control.Lens ((.~), view, (^.), (^..), (+~), (%~), (.~), sumOf, at, _Just, Each(..), (%%~), mapped)
88 import Control.Monad.Reader
89 import Data.Aeson hiding ((.=))
90 import qualified Data.Aeson.Text as DAT
91 import Data.Either (Either(..))
93 import qualified Data.List as List
94 import Data.Map.Strict (Map)
95 import qualified Data.Map.Strict as Map
96 import qualified Data.Map.Strict.Patch as PM
97 import Data.Maybe (fromMaybe)
99 import Data.Ord (Down(..))
100 import Data.Patch.Class (Action(act), Transformable(..), ours)
101 import qualified Data.Set as S
102 import qualified Data.Set as Set
103 import Data.Swagger hiding (version, patch)
104 import Data.Text (Text, isInfixOf, unpack)
105 import Data.Text.Lazy.IO as DTL
106 import Formatting (hprint, int, (%))
107 import Formatting.Clock (timeSpecs)
108 import GHC.Generics (Generic)
109 import Servant hiding (Patch)
110 import System.Clock (getTime, TimeSpec, Clock(..))
111 import System.IO (stderr)
112 import Test.QuickCheck (elements)
113 import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
115 import Prelude (error)
116 import Gargantext.Prelude
118 import Gargantext.API.Admin.Types (HasSettings)
119 import qualified Gargantext.API.Metrics as Metrics
120 import Gargantext.API.Ngrams.Types
121 import Gargantext.Core.Types (ListType(..), NodeId, ListId, DocId, Limit, Offset, HasInvalidError, TODO, assertValid)
122 import Gargantext.Core.Utils (something)
123 import Gargantext.Core.Viz.Graph.API (recomputeGraph)
124 import Gargantext.Core.Viz.Graph.Distances (Distance(Conditional))
125 import Gargantext.Database.Action.Metrics.NgramsByNode (getOccByNgramsOnlyFast')
126 import Gargantext.Database.Query.Table.Node.Select
127 import Gargantext.Database.Query.Table.Ngrams hiding (NgramsType(..), ngrams, ngramsType, ngrams_terms)
128 import Gargantext.Database.Admin.Config (userMaster)
129 import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
130 import Gargantext.Database.Admin.Types.Node (NodeType(..))
131 import Gargantext.Database.Prelude (HasConnectionPool, HasConfig)
132 import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
133 import Gargantext.Database.Query.Table.Node (getNode)
134 import Gargantext.Database.Schema.Node (NodePoly(..))
137 -- TODO sequences of modifications (Patchs)
138 type NgramsIdPatch = Patch NgramsId NgramsPatch
140 ngramsPatch :: Int -> NgramsPatch
141 ngramsPatch n = NgramsPatch (DM.fromList [(1, StopTerm)]) (Set.fromList [n]) Set.empty
143 toEdit :: NgramsId -> NgramsPatch -> Edit NgramsId NgramsPatch
144 toEdit n p = Edit n p
145 ngramsIdPatch :: Patch NgramsId NgramsPatch
146 ngramsIdPatch = fromList $ catMaybes $ reverse [ replace (1::NgramsId) (Just $ ngramsPatch 1) Nothing
147 , replace (1::NgramsId) Nothing (Just $ ngramsPatch 2)
148 , replace (2::NgramsId) Nothing (Just $ ngramsPatch 2)
151 -- applyPatchBack :: Patch -> IO Patch
152 -- isEmptyPatch = Map.all (\x -> Set.isEmpty (add_children x) && Set.isEmpty ... )
154 ------------------------------------------------------------------------
155 ------------------------------------------------------------------------
156 ------------------------------------------------------------------------
159 -- TODO: Replace.old is ignored which means that if the current list
160 -- `MapTerm` and that the patch is `Replace CandidateTerm StopTerm` then
161 -- the list is going to be `StopTerm` while it should keep `MapTerm`.
162 -- However this should not happen in non conflicting situations.
163 mkListsUpdate :: NgramsType -> NgramsTablePatch -> [(NgramsTypeId, NgramsTerm, ListTypeId)]
164 mkListsUpdate nt patches =
165 [ (ngramsTypeId nt, ng, listTypeId lt)
166 | (ng, patch) <- patches ^.. ntp_ngrams_patches . ifolded . withIndex
167 , lt <- patch ^.. patch_list . new
170 mkChildrenGroups :: (PatchSet NgramsTerm -> Set NgramsTerm)
173 -> [(NgramsTypeId, NgramsParent, NgramsChild)]
174 mkChildrenGroups addOrRem nt patches =
175 [ (ngramsTypeId nt, parent, child)
176 | (parent, patch) <- patches ^.. ntp_ngrams_patches . ifolded . withIndex
177 , child <- patch ^.. patch_children . to addOrRem . folded
181 ------------------------------------------------------------------------
183 saveRepo :: ( MonadReader env m, MonadBase IO m, HasRepoSaver env )
185 saveRepo = liftBase =<< view repoSaver
187 listTypeConflictResolution :: ListType -> ListType -> ListType
188 listTypeConflictResolution _ _ = undefined -- TODO Use Map User ListType
190 ngramsStatePatchConflictResolution
191 :: TableNgrams.NgramsType
194 -> ConflictResolutionNgramsPatch
195 ngramsStatePatchConflictResolution _ngramsType _nodeId _ngramsTerm
196 = (ours, (const ours, ours), (False, False))
197 -- (False, False) mean here that Mod has always priority.
198 -- (True, False) <- would mean priority to the left (same as ours).
200 -- undefined {- TODO think this through -}, listTypeConflictResolution)
203 -- Insertions are not considered as patches,
204 -- they do not extend history,
205 -- they do not bump version.
206 insertNewOnly :: a -> Maybe b -> a
207 insertNewOnly m = maybe m (const $ error "insertNewOnly: impossible")
208 -- TODO error handling
211 -- TODO refactor with putListNgrams
212 copyListNgrams :: RepoCmdM env err m
213 => NodeId -> NodeId -> NgramsType
215 copyListNgrams srcListId dstListId ngramsType = do
217 liftBase $ modifyMVar_ var $
218 pure . (r_state . at ngramsType %~ (Just . f . something))
221 f :: Map NodeId NgramsTableMap -> Map NodeId NgramsTableMap
222 f m = m & at dstListId %~ insertNewOnly (m ^. at srcListId)
224 -- TODO refactor with putListNgrams
225 -- The list must be non-empty!
226 -- The added ngrams must be non-existent!
227 addListNgrams :: RepoCmdM env err m
228 => NodeId -> NgramsType
229 -> [NgramsElement] -> m ()
230 addListNgrams listId ngramsType nes = do
232 liftBase $ modifyMVar_ var $
233 pure . (r_state . at ngramsType . _Just . at listId . _Just <>~ m)
236 m = Map.fromList $ (\n -> (n ^. ne_ngrams, n)) <$> nes
240 rmListNgrams :: RepoCmdM env err m
242 -> TableNgrams.NgramsType
244 rmListNgrams l nt = setListNgrams l nt mempty
246 -- | TODO: incr the Version number
247 -- && should use patch
249 setListNgrams :: RepoCmdM env err m
251 -> TableNgrams.NgramsType
252 -> Map NgramsTerm NgramsRepoElement
254 setListNgrams listId ngramsType ns = do
256 liftBase $ modifyMVar_ var $
260 (at listId .~ ( Just ns))
267 currentVersion :: RepoCmdM env err m
271 r <- liftBase $ readMVar var
272 pure $ r ^. r_version
275 -- tableNgramsPut :: (HasInvalidError err, RepoCmdM env err m)
276 commitStatePatch :: RepoCmdM env err m => Versioned NgramsStatePatch -> m (Versioned NgramsStatePatch)
277 commitStatePatch (Versioned p_version p) = do
279 vq' <- liftBase $ modifyMVar var $ \r -> do
281 q = mconcat $ take (r ^. r_version - p_version) (r ^. r_history)
282 (p', q') = transformWith ngramsStatePatchConflictResolution p q
283 r' = r & r_version +~ 1
285 & r_history %~ (p' :)
287 -- Ideally we would like to check these properties. However:
288 -- * They should be checked only to debug the code. The client data
289 -- should be able to trigger these.
290 -- * What kind of error should they throw (we are in IO here)?
291 -- * Should we keep modifyMVar?
292 -- * Should we throw the validation in an Exception, catch it around
293 -- modifyMVar and throw it back as an Error?
294 assertValid $ transformable p q
295 assertValid $ applicable p' (r ^. r_state)
297 pure (r', Versioned (r' ^. r_version) q')
302 -- This is a special case of tableNgramsPut where the input patch is empty.
303 tableNgramsPull :: RepoCmdM env err m
305 -> TableNgrams.NgramsType
307 -> m (Versioned NgramsTablePatch)
308 tableNgramsPull listId ngramsType p_version = do
310 r <- liftBase $ readMVar var
313 q = mconcat $ take (r ^. r_version - p_version) (r ^. r_history)
314 q_table = q ^. _PatchMap . at ngramsType . _Just . _PatchMap . at listId . _Just
316 pure (Versioned (r ^. r_version) q_table)
318 -- Apply the given patch to the DB and returns the patch to be applied on the
321 tableNgramsPut :: (HasNodeError err,
324 HasConnectionPool env,
329 -> Versioned NgramsTablePatch
330 -> m (Versioned NgramsTablePatch)
331 tableNgramsPut tabType listId (Versioned p_version p_table)
332 | p_table == mempty = do
333 let ngramsType = ngramsTypeFromTabType tabType
334 tableNgramsPull listId ngramsType p_version
337 let ngramsType = ngramsTypeFromTabType tabType
338 (p0, p0_validity) = PM.singleton listId p_table
339 (p, p_validity) = PM.singleton ngramsType p0
341 assertValid p0_validity
342 assertValid p_validity
344 ret <- commitStatePatch (Versioned p_version p)
345 <&> v_data %~ (view (_PatchMap . at ngramsType . _Just . _PatchMap . at listId . _Just))
347 node <- getNode listId
348 let nId = _node_id node
349 uId = _node_userId node
350 mCId = _node_parentId node
351 printDebug "[tableNgramsPut] updating graph with nId" nId
352 printDebug "[tableNgramsPut] updating graph with uId" uId
353 _ <- recomputeGraph uId nId Conditional
357 printDebug "[tableNgramsPut] can't update charts, no parent, nId" nId
360 printDebug "[tableNgramsPut] updating scatter cId" cId
361 _ <- Metrics.updateScatter cId (Just listId) tabType Nothing
362 printDebug "[tableNgramsPut] updating chart cId" cId
363 _ <- Metrics.updateChart cId (Just listId) tabType Nothing
364 printDebug "[tableNgramsPut] updating pie cId" cId
365 _ <- Metrics.updatePie cId (Just listId) tabType Nothing
366 printDebug "[tableNgramsPut] updating tree StopTerm, cId" cId
367 _ <- Metrics.updateTree cId (Just listId) tabType StopTerm
368 printDebug "[tableNgramsPut] updating tree CandidateTerm, cId" cId
369 _ <- Metrics.updateTree cId (Just listId) tabType CandidateTerm
370 printDebug "[tableNgramsPut] updating tree MapTerm, cId" cId
371 _ <- Metrics.updateTree cId (Just listId) tabType MapTerm
377 { _ne_list :: ListType
378 If we merge the parents/children we can potentially create cycles!
379 , _ne_parent :: Maybe NgramsTerm
380 , _ne_children :: MSet NgramsTerm
384 getNgramsTableMap :: RepoCmdM env err m
386 -> TableNgrams.NgramsType
387 -> m (Versioned NgramsTableMap)
388 getNgramsTableMap nodeId ngramsType = do
390 repo <- liftBase $ readMVar v
391 pure $ Versioned (repo ^. r_version)
392 (repo ^. r_state . at ngramsType . _Just . at nodeId . _Just)
394 dumpJsonTableMap :: RepoCmdM env err m
397 -> TableNgrams.NgramsType
399 dumpJsonTableMap fpath nodeId ngramsType = do
400 m <- getNgramsTableMap nodeId ngramsType
401 liftBase $ DTL.writeFile (unpack fpath) (DAT.encodeToLazyText m)
407 -- | TODO Errors management
408 -- TODO: polymorphic for Annuaire or Corpus or ...
409 -- | Table of Ngrams is a ListNgrams formatted (sorted and/or cut).
410 -- TODO: should take only one ListId
412 getTime' :: MonadBase IO m => m TimeSpec
413 getTime' = liftBase $ getTime ProcessCPUTime
416 getTableNgrams :: forall env err m.
417 (RepoCmdM env err m, HasNodeError err, HasConnectionPool env, HasConfig env)
418 => NodeType -> NodeId -> TabType
419 -> ListId -> Limit -> Maybe Offset
421 -> Maybe MinSize -> Maybe MaxSize
423 -> (NgramsTerm -> Bool)
424 -> m (Versioned NgramsTable)
425 getTableNgrams _nType nId tabType listId limit_ offset
426 listType minSize maxSize orderBy searchQuery = do
429 -- lIds <- selectNodesWithUsername NodeList userMaster
431 ngramsType = ngramsTypeFromTabType tabType
432 offset' = maybe 0 identity offset
433 listType' = maybe (const True) (==) listType
434 minSize' = maybe (const True) (<=) minSize
435 maxSize' = maybe (const True) (>=) maxSize
437 selected_node n = minSize' s
439 && searchQuery (n ^. ne_ngrams)
440 && listType' (n ^. ne_list)
444 selected_inner roots n = maybe False (`Set.member` roots) (n ^. ne_root)
446 ---------------------------------------
447 sortOnOrder Nothing = identity
448 sortOnOrder (Just TermAsc) = List.sortOn $ view ne_ngrams
449 sortOnOrder (Just TermDesc) = List.sortOn $ Down . view ne_ngrams
450 sortOnOrder (Just ScoreAsc) = List.sortOn $ view ne_occurrences
451 sortOnOrder (Just ScoreDesc) = List.sortOn $ Down . view ne_occurrences
453 ---------------------------------------
454 selectAndPaginate :: Map NgramsTerm NgramsElement -> [NgramsElement]
455 selectAndPaginate tableMap = roots <> inners
457 list = tableMap ^.. each
458 rootOf ne = maybe ne (\r -> fromMaybe (panic "getTableNgrams: invalid root") (tableMap ^. at r))
460 selected_nodes = list & take limit_
462 . filter selected_node
463 . sortOnOrder orderBy
464 roots = rootOf <$> selected_nodes
465 rootsSet = Set.fromList (_ne_ngrams <$> roots)
466 inners = list & filter (selected_inner rootsSet)
468 ---------------------------------------
469 setScores :: forall t. Each t t NgramsElement NgramsElement => Bool -> t -> m t
470 setScores False table = pure table
471 setScores True table = do
472 let ngrams_terms = unNgramsTerm <$> (table ^.. each . ne_ngrams)
474 occurrences <- getOccByNgramsOnlyFast' nId
479 liftBase $ hprint stderr
480 ("getTableNgrams/setScores #ngrams=" % int % " time=" % timeSpecs % "\n")
481 (length ngrams_terms) t1 t2
483 occurrences <- getOccByNgramsOnlySlow nType nId
489 setOcc ne = ne & ne_occurrences .~ sumOf (at (unNgramsTerm (ne ^. ne_ngrams)) . _Just) occurrences
491 pure $ table & each %~ setOcc
492 ---------------------------------------
494 -- lists <- catMaybes <$> listsWith userMaster
495 -- trace (show lists) $
496 -- getNgramsTableMap ({-lists <>-} listIds) ngramsType
498 let scoresNeeded = needsScores orderBy
499 tableMap1 <- getNgramsTableMap listId ngramsType
501 tableMap2 <- tableMap1 & v_data %%~ setScores scoresNeeded
502 . Map.mapWithKey ngramsElementFromRepo
504 tableMap3 <- tableMap2 & v_data %%~ fmap NgramsTable
505 . setScores (not scoresNeeded)
508 liftBase $ hprint stderr
509 ("getTableNgrams total=" % timeSpecs
510 % " map1=" % timeSpecs
511 % " map2=" % timeSpecs
512 % " map3=" % timeSpecs
513 % " sql=" % (if scoresNeeded then "map2" else "map3")
515 ) t0 t3 t0 t1 t1 t2 t2 t3
519 scoresRecomputeTableNgrams :: forall env err m. (RepoCmdM env err m, HasNodeError err, HasConnectionPool env, HasConfig env) => NodeId -> TabType -> ListId -> m Int
520 scoresRecomputeTableNgrams nId tabType listId = do
521 tableMap <- getNgramsTableMap listId ngramsType
522 _ <- tableMap & v_data %%~ setScores
523 . Map.mapWithKey ngramsElementFromRepo
527 ngramsType = ngramsTypeFromTabType tabType
529 setScores :: forall t. Each t t NgramsElement NgramsElement => t -> m t
531 let ngrams_terms = unNgramsTerm <$> (table ^.. each . ne_ngrams)
532 occurrences <- getOccByNgramsOnlyFast' nId
537 setOcc ne = ne & ne_occurrences .~ sumOf (at (unNgramsTerm (ne ^. ne_ngrams)) . _Just) occurrences
539 pure $ table & each %~ setOcc
545 -- TODO: find a better place for the code above, All APIs stay here
547 data OrderBy = TermAsc | TermDesc | ScoreAsc | ScoreDesc
548 deriving (Generic, Enum, Bounded, Read, Show)
550 instance FromHttpApiData OrderBy
552 parseUrlPiece "TermAsc" = pure TermAsc
553 parseUrlPiece "TermDesc" = pure TermDesc
554 parseUrlPiece "ScoreAsc" = pure ScoreAsc
555 parseUrlPiece "ScoreDesc" = pure ScoreDesc
556 parseUrlPiece _ = Left "Unexpected value of OrderBy"
559 instance ToParamSchema OrderBy
560 instance FromJSON OrderBy
561 instance ToJSON OrderBy
562 instance ToSchema OrderBy
563 instance Arbitrary OrderBy
565 arbitrary = elements [minBound..maxBound]
567 needsScores :: Maybe OrderBy -> Bool
568 needsScores (Just ScoreAsc) = True
569 needsScores (Just ScoreDesc) = True
570 needsScores _ = False
572 type TableNgramsApiGet = Summary " Table Ngrams API Get"
573 :> QueryParamR "ngramsType" TabType
574 :> QueryParamR "list" ListId
575 :> QueryParamR "limit" Limit
576 :> QueryParam "offset" Offset
577 :> QueryParam "listType" ListType
578 :> QueryParam "minTermSize" MinSize
579 :> QueryParam "maxTermSize" MaxSize
580 :> QueryParam "orderBy" OrderBy
581 :> QueryParam "search" Text
582 :> Get '[JSON] (Versioned NgramsTable)
584 type TableNgramsApiPut = Summary " Table Ngrams API Change"
585 :> QueryParamR "ngramsType" TabType
586 :> QueryParamR "list" ListId
587 :> ReqBody '[JSON] (Versioned NgramsTablePatch)
588 :> Put '[JSON] (Versioned NgramsTablePatch)
590 type RecomputeScoresNgramsApiGet = Summary " Recompute scores for ngrams table"
591 :> QueryParamR "ngramsType" TabType
592 :> QueryParamR "list" ListId
593 :> "recompute" :> Post '[JSON] Int
595 type TableNgramsApiGetVersion = Summary " Table Ngrams API Get Version"
596 :> QueryParamR "ngramsType" TabType
597 :> QueryParamR "list" ListId
598 :> Get '[JSON] Version
600 type TableNgramsApi = TableNgramsApiGet
601 :<|> TableNgramsApiPut
602 :<|> RecomputeScoresNgramsApiGet
603 :<|> "version" :> TableNgramsApiGetVersion
605 getTableNgramsCorpus :: (RepoCmdM env err m, HasNodeError err, HasConnectionPool env, HasConfig env)
612 -> Maybe MinSize -> Maybe MaxSize
614 -> Maybe Text -- full text search
615 -> m (Versioned NgramsTable)
616 getTableNgramsCorpus nId tabType listId limit_ offset listType minSize maxSize orderBy mt =
617 getTableNgrams NodeCorpus nId tabType listId limit_ offset listType minSize maxSize orderBy searchQuery
619 searchQuery (NgramsTerm nt) = maybe (const True) isInfixOf mt nt
621 getTableNgramsVersion :: (RepoCmdM env err m, HasNodeError err, HasConnectionPool env, HasConfig env)
626 getTableNgramsVersion _nId _tabType _listId = currentVersion
628 -- Versioned { _v_version = v } <- getTableNgramsCorpus nId tabType listId 100000 Nothing Nothing Nothing Nothing Nothing Nothing
629 -- This line above looks like a waste of computation to finally get only the version.
630 -- See the comment about listNgramsChangedSince.
633 -- | Text search is deactivated for now for ngrams by doc only
634 getTableNgramsDoc :: (RepoCmdM env err m, HasNodeError err, HasConnectionPool env, HasConfig env)
636 -> ListId -> Limit -> Maybe Offset
638 -> Maybe MinSize -> Maybe MaxSize
640 -> Maybe Text -- full text search
641 -> m (Versioned NgramsTable)
642 getTableNgramsDoc dId tabType listId limit_ offset listType minSize maxSize orderBy _mt = do
643 ns <- selectNodesWithUsername NodeList userMaster
644 let ngramsType = ngramsTypeFromTabType tabType
645 ngs <- selectNgramsByDoc (ns <> [listId]) dId ngramsType
646 let searchQuery (NgramsTerm nt) = flip S.member (S.fromList ngs) nt
647 getTableNgrams NodeDocument dId tabType listId limit_ offset listType minSize maxSize orderBy searchQuery
651 apiNgramsTableCorpus :: ( RepoCmdM env err m
653 , HasInvalidError err
654 , HasConnectionPool env
658 => NodeId -> ServerT TableNgramsApi m
659 apiNgramsTableCorpus cId = getTableNgramsCorpus cId
661 :<|> scoresRecomputeTableNgrams cId
662 :<|> getTableNgramsVersion cId
664 apiNgramsTableDoc :: ( RepoCmdM env err m
666 , HasInvalidError err
667 , HasConnectionPool env
671 => DocId -> ServerT TableNgramsApi m
672 apiNgramsTableDoc dId = getTableNgramsDoc dId
674 :<|> scoresRecomputeTableNgrams dId
675 :<|> getTableNgramsVersion dId
676 -- > index all the corpus accordingly (TODO AD)
678 -- Did the given list of ngrams changed since the given version?
679 -- The returned value is versioned boolean value, meaning that one always retrieve the
681 -- If the given version is negative then one simply receive the latest version and True.
682 -- Using this function is more precise than simply comparing the latest version number
683 -- with the local version number. Indeed there might be no change to this particular list
684 -- and still the version number has changed because of other lists.
686 -- Here the added value is to make a compromise between precision, computation, and bandwidth:
687 -- * currentVersion: good computation, good bandwidth, bad precision.
688 -- * listNgramsChangedSince: good precision, good bandwidth, bad computation.
689 -- * tableNgramsPull: good precision, good bandwidth (if you use the received data!), bad computation.
690 listNgramsChangedSince :: RepoCmdM env err m
691 => ListId -> TableNgrams.NgramsType -> Version -> m (Versioned Bool)
692 listNgramsChangedSince listId ngramsType version
694 Versioned <$> currentVersion <*> pure True
696 tableNgramsPull listId ngramsType version & mapped . v_data %~ (== mempty)