2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
17 module Gargantext.Text.List
20 import Data.Either (partitionEithers, Either(..))
23 import Data.Text (Text)
24 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
25 import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
26 import Gargantext.Core (Lang(..))
27 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
28 import Gargantext.Database.Action.Metrics.NgramsByNode (getTficf, sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
29 import Gargantext.Database.Admin.Utils (Cmd)
30 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
31 import Gargantext.Prelude
32 import Gargantext.Text.List.Learn (Model(..))
33 import Gargantext.Text.Metrics (takeScored)
34 import qualified Data.Char as Char
35 import qualified Data.List as List
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Data.Text as Text
41 data NgramsListBuilder = BuilderStepO { stemSize :: Int
45 | BuilderStep1 { withModel :: Model }
46 | BuilderStepN { withModel :: Model }
47 | Tficf { nlb_lang :: Lang
50 , nlb_stopSize :: StopSize
51 , nlb_userCorpusId :: UserCorpusId
52 , nlb_masterCorpusId :: MasterCorpusId
56 data StopSize = StopSize {unStopSize :: Int}
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: Lang
65 -> Cmd err (Map NgramsType [NgramsElement])
66 buildNgramsLists l n m s uCid mCid = do
67 ngTerms <- buildNgramsTermsList l n m s uCid mCid
68 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
69 pure $ Map.unions $ othersTerms <> [ngTerms]
72 buildNgramsOthersList :: UserCorpusId
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
81 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
82 graphTerms = List.take listSize all'
83 candiTerms = List.drop listSize all'
84 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
85 , toElements CandidateTerm candiTerms]
87 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
94 buildNgramsTermsList' :: UserCorpusId
96 -> ((Text, (Set Text, Set NodeId)) -> Bool)
99 -> Cmd err (Map NgramsType [NgramsElement])
101 buildNgramsTermsList' uCid groupIt stop gls is = do
102 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
105 (stops, candidates) = partitionEithers
106 $ map (\t -> if stop t then Left t else Right t)
108 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
110 (maps, candidates') = takeScored gls is
111 $ getCoocByNgrams' snd (Diagonal True)
112 $ Map.fromList candidates
115 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
118 , List.filter (\(k,_) -> List.elem k candidates') candidates
119 , List.filter (\(k,_) -> List.elem k maps) candidates
122 let ngs' = List.concat
123 $ map toNgramsElement
124 $ map (\t -> (StopTerm, toList' t)) s
125 <> map (\t -> (CandidateTerm, toList' t)) c
126 <> map (\t -> (GraphTerm, toList' t)) m
128 pure $ Map.fromList [(NgramsTerms, ngs')]
131 buildNgramsTermsList :: Lang
137 -> Cmd err (Map NgramsType [NgramsElement])
138 buildNgramsTermsList l n m s uCid mCid = do
139 candidates <- sortTficf <$> getTficf uCid mCid NgramsTerms (ngramsGroup l n m)
142 candidatesSize = 2000
147 candidatesHead = List.take candidatesSize candidates
148 candidatesTail = List.drop candidatesSize candidates
150 termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
151 <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
153 ngs = List.concat $ map toNgramsElement termList
155 pure $ Map.fromList [(NgramsTerms, ngs)]
158 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
159 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
160 case Set.toList setNgrams of
162 (parent:children) -> [parentElem] <> childrenElems
164 parentElem = mkNgramsElement parent
167 (mSetFromList children)
168 childrenElems = map (\t -> mkNgramsElement t listType
169 (Just $ RootParent parent parent)
174 toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
175 toList stop l n = case stop n of
176 True -> (StopTerm, n)
180 toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
181 toTermList _ _ _ [] = []
182 toTermList a b stop ns = -- trace ("computing toTermList") $
183 map (toList stop CandidateTerm) xs
184 <> map (toList stop GraphTerm) ys
185 <> toTermList a b stop zs
194 isStopTerm :: StopSize -> Text -> Bool
195 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
197 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)