2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
17 module Gargantext.Text.List
20 -- import Data.Either (partitionEithers, Either(..))
23 import Data.Text (Text)
24 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
25 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
26 import Gargantext.Core (Lang(..))
27 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, Ordering(..))
28 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
29 import Gargantext.Database.Admin.Utils (Cmd)
30 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
31 import Gargantext.Prelude
32 import Gargantext.Text.List.Learn (Model(..))
33 -- import Gargantext.Text.Metrics (takeScored)
34 import qualified Data.Char as Char
35 import qualified Data.List as List
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Data.Text as Text
41 data NgramsListBuilder = BuilderStepO { stemSize :: Int
45 | BuilderStep1 { withModel :: Model }
46 | BuilderStepN { withModel :: Model }
47 | Tficf { nlb_lang :: Lang
50 , nlb_stopSize :: StopSize
51 , nlb_userCorpusId :: UserCorpusId
52 , nlb_masterCorpusId :: MasterCorpusId
56 data StopSize = StopSize {unStopSize :: Int}
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: Lang
65 -> Cmd err (Map NgramsType [NgramsElement])
66 buildNgramsLists l n m s uCid mCid = do
67 ngTerms <- buildNgramsTermsList l n m s uCid mCid
68 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
69 pure $ Map.unions $ othersTerms <> [ngTerms]
72 buildNgramsOthersList :: UserCorpusId
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
81 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
83 graphTerms = List.take listSize all'
84 candiTerms = List.drop listSize all'
86 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
87 , toElements CandidateTerm candiTerms
91 Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
97 buildNgramsTermsList' :: UserCorpusId
99 -> ((Text, (Set Text, Set NodeId)) -> Bool)
102 -> Cmd err (Map NgramsType [NgramsElement])
104 buildNgramsTermsList' uCid groupIt stop gls is = do
105 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
108 (stops, candidates) = partitionEithers
109 $ map (\t -> if stop t then Left t else Right t)
111 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
113 (maps, candidates') = takeScored gls is
114 $ getCoocByNgrams' snd (Diagonal True)
115 $ Map.fromList candidates
118 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
121 , List.filter (\(k,_) -> List.elem k candidates') candidates
122 , List.filter (\(k,_) -> List.elem k maps) candidates
125 let ngs' = List.concat
126 $ map toNgramsElement
127 $ map (\t -> (StopTerm , toList' t)) s
128 <> map (\t -> (CandidateTerm, toList' t)) c
129 <> map (\t -> (GraphTerm , toList' t)) m
131 pure $ Map.fromList [(NgramsTerms, ngs')]
137 buildNgramsTermsList :: Lang
143 -> Cmd err (Map NgramsType [NgramsElement])
144 buildNgramsTermsList l n m s uCid mCid = do
145 candidates <- sortTficf Down <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
153 candidatesHead = List.take candidatesSize candidates
154 candidatesTail = List.drop candidatesSize candidates
157 -- (toTermList a b ((isStopTerm s) . fst) candidatesHead)
158 (map (toGargList ((isStopTerm s) .fst) GraphTerm) candidatesHead)
159 <> (map (toGargList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
161 ngs = List.concat $ map toNgramsElement termList
163 pure $ Map.fromList [(NgramsTerms, ngs)]
171 toTermList _ _ _ [] = []
172 toTermList a b stop ns = -- trace ("computing toTermList") $
173 map (toGargList stop CandidateTerm) xs
174 <> map (toGargList stop GraphTerm) ys
175 <> toTermList a b stop zs
184 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
185 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
186 case Set.toList setNgrams of
188 (parent:children) -> [parentElem] <> childrenElems
190 parentElem = mkNgramsElement parent
193 (mSetFromList children)
194 childrenElems = map (\t -> mkNgramsElement t listType
195 (Just $ RootParent parent parent)
200 toGargList :: (b -> Bool) -> ListType -> b -> (ListType, b)
201 toGargList stop l n = case stop n of
202 True -> (StopTerm, n)
207 isStopTerm :: StopSize -> Text -> Bool
208 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
210 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)