2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
17 module Gargantext.Text.List
20 import Data.Either (partitionEithers, Either(..))
21 -- import Debug.Trace (trace)
24 import Data.Text (Text)
25 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
26 import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
27 import Gargantext.Core (Lang(..))
28 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
29 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
30 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
31 import Gargantext.Database.Utils (Cmd)
32 import Gargantext.Text.List.Learn (Model(..))
33 import Gargantext.Text.Metrics (takeScored)
34 import Gargantext.Prelude
35 --import Gargantext.Text.Terms (TermType(..))
36 import qualified Data.Char as Char
37 import qualified Data.List as List
38 import qualified Data.Map as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
43 data NgramsListBuilder = BuilderStepO { stemSize :: Int
47 | BuilderStep1 { withModel :: Model }
48 | BuilderStepN { withModel :: Model }
49 | Tficf { nlb_lang :: Lang
52 , nlb_stopSize :: StopSize
53 , nlb_userCorpusId :: UserCorpusId
54 , nlb_masterCorpusId :: MasterCorpusId
58 data StopSize = StopSize {unStopSize :: Int}
60 -- | TODO improve grouping functions of Authors, Sources, Institutes..
61 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
62 -> Cmd err (Map NgramsType [NgramsElement])
63 buildNgramsLists l n m s uCid mCid = do
64 ngTerms <- buildNgramsTermsList l n m s uCid mCid
65 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
66 pure $ Map.unions $ othersTerms <> [ngTerms]
69 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
70 -> Cmd err (Map NgramsType [NgramsElement])
71 buildNgramsOthersList uCid groupIt nt = do
72 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
76 all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
77 graphTerms = List.take listSize all'
78 candiTerms = List.drop listSize all'
79 pure $ Map.unionsWith (<>) [ toElements GraphTerm graphTerms
80 , toElements CandidateTerm candiTerms]
82 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
89 buildNgramsTermsList' :: UserCorpusId
91 -> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int
92 -> Cmd err (Map NgramsType [NgramsElement])
94 buildNgramsTermsList' uCid groupIt stop gls is = do
95 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
98 (stops, candidates) = partitionEithers
99 $ map (\t -> if stop t then Left t else Right t)
101 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
103 (maps, candidates') = takeScored gls is
104 $ getCoocByNgrams' snd (Diagonal True)
105 $ Map.fromList candidates
108 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
111 , List.filter (\(k,_) -> List.elem k candidates') candidates
112 , List.filter (\(k,_) -> List.elem k maps) candidates
115 let ngs' = List.concat
116 $ map toNgramsElement
117 $ map (\t -> (StopTerm, toList' t)) s
118 <> map (\t -> (CandidateTerm, toList' t)) c
119 <> map (\t -> (GraphTerm, toList' t)) m
121 pure $ Map.fromList [(NgramsTerms, ngs')]
124 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
125 -> Cmd err (Map NgramsType [NgramsElement])
126 buildNgramsTermsList l n m s uCid mCid = do
127 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
129 candidatesSize = 2000
132 candidatesHead = List.take candidatesSize candidates
133 candidatesTail = List.drop candidatesSize candidates
134 termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
135 <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
136 let ngs = List.concat $ map toNgramsElement termList
138 pure $ Map.fromList [(NgramsTerms, ngs)]
141 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
142 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
143 case Set.toList setNgrams of
145 (parent:children) -> [parentElem] <> childrenElems
147 parentElem = mkNgramsElement parent
150 (mSetFromList children)
151 childrenElems = map (\t -> mkNgramsElement t listType
152 (Just $ RootParent parent parent)
157 toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
158 toList stop l n = case stop n of
159 True -> (StopTerm, n)
163 toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
164 toTermList _ _ _ [] = []
165 toTermList a b stop ns = -- trace ("computing toTermList") $
166 map (toList stop CandidateTerm) xs
167 <> map (toList stop GraphTerm) ys
168 <> toTermList a b stop zs
177 isStopTerm :: StopSize -> Text -> Bool
178 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
180 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)