2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
16 module Gargantext.Text.List
21 import Data.Text (Text)
22 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
25 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
26 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
27 import Gargantext.Database.Utils (Cmd)
28 import Gargantext.Text.List.Learn (Model(..))
29 import Gargantext.Prelude
30 --import Gargantext.Text.Terms (TermType(..))
31 import qualified Data.Char as Char
32 import qualified Data.List as List
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35 import qualified Data.Text as Text
38 data NgramsListBuilder = BuilderStepO { stemSize :: Int
42 | BuilderStep1 { withModel :: Model }
43 | BuilderStepN { withModel :: Model }
47 data StopSize = StopSize {unStopSize :: Int}
49 -- | TODO improve grouping functions of Authors, Sources, Institutes..
50 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
51 -> Cmd err (Map NgramsType [NgramsElement])
52 buildNgramsLists l n m s uCid mCid = do
53 ngTerms <- buildNgramsTermsList l n m s uCid mCid
54 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
55 pure $ Map.unions $ othersTerms <> [ngTerms]
58 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
59 -> Cmd err (Map NgramsType [NgramsElement])
60 buildNgramsOthersList uCid groupIt nt = do
61 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
65 pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all')
67 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
73 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
74 -> Cmd err (Map NgramsType [NgramsElement])
75 buildNgramsTermsList l n m s uCid mCid = do
76 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
77 let termList = toTermList ((isStopTerm s) . fst) candidates
78 let ngs = List.concat $ map toNgramsElement termList
80 pure $ Map.fromList [(NgramsTerms, ngs)]
83 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
84 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
85 case Set.toList setNgrams of
87 (parent:children) -> [parentElem] <> childrenElems
89 parentElem = mkNgramsElement parent
92 (mSetFromList children)
93 childrenElems = map (\t -> mkNgramsElement t listType
94 (Just $ RootParent parent parent)
98 -- TODO remove hard coded parameters
99 toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
100 toTermList stop ns = map (toTermList' stop CandidateTerm) xs
101 <> map (toTermList' stop GraphTerm) ys
102 <> map (toTermList' stop CandidateTerm) zs
104 toTermList' stop' l n = case stop' n of
105 True -> (StopTerm, n)
108 -- TODO use % of size of list
111 ys = take b $ drop a ns
112 zs = drop b $ drop a ns
117 isStopTerm :: StopSize -> Text -> Bool
118 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
120 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)