2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
16 module Gargantext.Text.List
21 import Data.Text (Text)
22 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
25 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
26 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
27 import Gargantext.Database.Utils (Cmd)
28 import Gargantext.Text.List.Learn (Model(..))
29 import Gargantext.Prelude
30 --import Gargantext.Text.Terms (TermType(..))
31 import qualified Data.Char as Char
32 import qualified Data.List as List
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35 import qualified Data.Text as Text
38 data NgramsListBuilder = BuilderStepO { stemSize :: Int
42 | BuilderStep1 { withModel :: Model }
43 | BuilderStepN { withModel :: Model }
47 data StopSize = StopSize {unStopSize :: Int}
49 -- | TODO improve grouping functions of Authors, Sources, Institutes..
50 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
51 -> Cmd err (Map NgramsType [NgramsElement])
52 buildNgramsLists l n m s uCid mCid = do
53 ngTerms <- buildNgramsTermsList l n m s uCid mCid
54 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
55 pure $ Map.unions $ othersTerms <> [ngTerms]
58 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
59 -> Cmd err (Map NgramsType [NgramsElement])
60 buildNgramsOthersList uCid groupIt nt = do
61 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
63 pure $ Map.fromList [(nt, [ mkNgramsElement t CandidateTerm Nothing (mSetFromList [])
64 | (t,_ns) <- Map.toList ngs
69 -- TODO remove hard coded parameters
70 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
71 -> Cmd err (Map NgramsType [NgramsElement])
72 buildNgramsTermsList l n m s uCid mCid = do
73 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
74 --printDebug "candidate" (length candidates)
76 let termList = toTermList ((isStopTerm s) . fst) candidates
77 --let termList = toTermList ((\_ -> False) . fst) candidates
78 --printDebug "termlist" (length termList)
80 let ngs = List.concat $ map toNgramsElement termList
82 pure $ Map.fromList [(NgramsTerms, ngs)]
85 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
86 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
87 case Set.toList setNgrams of
89 (parent:children) -> [parentElem] <> childrenElems
91 parentElem = mkNgramsElement parent
94 (mSetFromList children)
95 childrenElems = map (\t -> mkNgramsElement t listType
96 (Just $ RootParent parent parent)
100 -- TODO remove hard coded parameters
101 toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
102 toTermList stop ns = map (toTermList' stop CandidateTerm) xs
103 <> map (toTermList' stop GraphTerm) ys
104 <> map (toTermList' stop CandidateTerm) zs
106 toTermList' stop' l n = case stop' n of
107 True -> (StopTerm, n)
110 -- TODO use % of size of list
113 ys = take b $ drop a ns
114 zs = drop b $ drop a ns
119 isStopTerm :: StopSize -> Text -> Bool
120 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
122 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)