2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
16 module Gargantext.Text.List
21 import Data.Text (Text)
22 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
25 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
26 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
27 import Gargantext.Database.Utils (Cmd)
28 import Gargantext.Text.List.Learn (Model(..))
29 import Gargantext.Prelude
30 --import Gargantext.Text.Terms (TermType(..))
31 import qualified Data.Char as Char
32 import qualified Data.List as List
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35 import qualified Data.Text as Text
38 data NgramsListBuilder = BuilderStepO { stemSize :: Int
42 | BuilderStep1 { withModel :: Model }
43 | BuilderStepN { withModel :: Model }
46 data StopSize = StopSize {unStopSize :: Int}
48 -- | TODO improve grouping functions of Authors, Sources, Institutes..
49 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
50 -> Cmd err (Map NgramsType [NgramsElement])
51 buildNgramsLists l n m s uCid mCid = do
52 ngTerms <- buildNgramsTermsList l n m s uCid mCid
53 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
54 pure $ Map.unions $ othersTerms <> [ngTerms]
57 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
58 -> Cmd err (Map NgramsType [NgramsElement])
59 buildNgramsOthersList uCid groupIt nt = do
60 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
64 pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all')
66 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
72 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsTermsList l n m s uCid mCid = do
75 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
76 let termList = toTermList ((isStopTerm s) . fst) candidates
77 let ngs = List.concat $ map toNgramsElement termList
79 pure $ Map.fromList [(NgramsTerms, ngs)]
82 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
83 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
84 case Set.toList setNgrams of
86 (parent:children) -> [parentElem] <> childrenElems
88 parentElem = mkNgramsElement parent
91 (mSetFromList children)
92 childrenElems = map (\t -> mkNgramsElement t listType
93 (Just $ RootParent parent parent)
97 -- TODO remove hard coded parameters
98 toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
99 toTermList stop ns = map (toTermList' stop CandidateTerm) xs
100 <> map (toTermList' stop GraphTerm) ys
101 <> map (toTermList' stop CandidateTerm) zs
103 toTermList' stop' l n = case stop' n of
104 True -> (StopTerm, n)
107 -- TODO use % of size of list
110 ys = take b $ drop a ns
111 zs = drop b $ drop a ns
116 isStopTerm :: StopSize -> Text -> Bool
117 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
119 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)