{-# OPTIONS_GHC -fno-warn-orphans #-}
-{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE InstanceSigs #-}
+{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
, allDataOrigins
, do_api
+ , indexAllDocumentsWithPosTag
)
where
import Data.Maybe (catMaybes)
import Data.Monoid
import Data.Swagger
-import Data.Text (splitOn, intercalate)
+import Data.Text (splitOn)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
import qualified Data.Map as Map
-import Gargantext.Core (Lang(..))
+import Gargantext.Core (Lang(..), PosTagAlgo(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Core.Flow.Types
import Gargantext.Core.Text
-import Gargantext.Core.Text.List.Group.WithStem (StopSize(..), GroupParams(..))
+import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists)
import Gargantext.Core.Text.Terms
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
-import Gargantext.Core.Types (Terms(..))
+import Gargantext.Core.Types (POS(NP))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
import Gargantext.Database.Query.Table.NodeNodeNgrams2
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
-import Gargantext.Database.Schema.Node (NodePoly(..))
+import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
import Gargantext.Database.Types
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
+------------------------------------------------------------------------
+-- Imports for the upgrade function
+import Gargantext.Database.Query.Tree.Root (getRootId)
+import Gargantext.Database.Query.Tree (findNodesId)
+import qualified Data.List as List
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
-- User List Flow
(masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
- ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
+
+ --let gp = (GroupParams l 2 3 (StopSize 3))
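+  -- Group candidate ngrams by their CoreNLP POS tag rather than by stem (previous GroupParams).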
+ let gp = GroupWithPosTag l CoreNLP HashMap.empty
+ ngs <- buildNgramsLists gp user userCorpusId masterCorpusId
+
_userListId <- flowList_DbRepo listId ngs
_mastListId <- getOrMkList masterCorpusId masterUserId
-- _ <- insertOccsUpdates userCorpusId mastListId
$ catMaybes [ NodeNodeNgrams2 <$> Just nId
<*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
<*> Just (fromIntegral w :: Double)
- | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
+ | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
, (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
- , (nId, w) <- Map.toList mapNodeIdWeight
+ , (nId, w) <- Map.toList mapNodeIdWeight
]
-- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
$ maybe ["Nothing"] (splitOn ", ")
$ _hd_authors doc
- terms' <- map text2ngrams
- <$> map (intercalate " " . _terms_label)
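+  -- Keep POS information: extracted terms are enriched with CoreNLP tags (NP) instead of being flattened to plain labels.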
+ terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
<$> concat
<$> liftBase (extractTerms lang' $ hasText doc)
- pure $ HashMap.fromList $ [(SimpleNgrams source, Map.singleton Sources 1)]
- <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
- <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
- <> [(SimpleNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
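+  -- Terms are stored as EnrichedNgrams; sources, institutes and authors remain SimpleNgrams.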
+ pure $ HashMap.fromList
+ $ [(SimpleNgrams source, Map.singleton Sources 1) ]
+ <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
+ <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
+ <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
where
hasText (Node _ _ _ _ _ _ _ h) = hasText h
+
+-- | Upgrade function (TODO: move elsewhere).
+-- Assumes all documents are in English (currently the case).
+indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
+indexAllDocumentsWithPosTag = do
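+  -- Collect all documents from every corpus under the master user's root.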
+ rootId <- getRootId (UserName userMaster)
+ corpusIds <- findNodesId rootId [NodeCorpus]
+ docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
+
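+  -- Re-extract and insert ngrams in batches of 1000 documents.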
+ _ <- mapM extractInsert (splitEvery 1000 docs)
+
+ pure ()
+
+extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
+extractInsert docs = do
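+  -- Pair each document with its node id.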
+ let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
+
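+  -- Extract ngrams from the documents (assumed English: Multi EN).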
+ mapNgramsDocs' <- mapNodeIdNgrams
+ <$> documentIdWithNgrams
+ (extractNgramsT $ withLang (Multi EN) documentsWithId)
+ documentsWithId
+
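+  -- Insert the extracted ngrams into the database.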
+ _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
+
+ pure ()
+