]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
phyloQuality
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15
16 module Gargantext.Text.List
17 where
18
19 import Data.Either (partitionEithers, Either(..))
20 import Debug.Trace (trace)
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
25 import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
26 import Gargantext.Core (Lang(..))
27 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
28 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
29 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
30 import Gargantext.Database.Utils (Cmd)
31 import Gargantext.Text.List.Learn (Model(..))
32 import Gargantext.Text.Metrics (takeScored)
33 import Gargantext.Prelude
34 --import Gargantext.Text.Terms (TermType(..))
35 import qualified Data.Char as Char
36 import qualified Data.List as List
37 import qualified Data.Map as Map
38 import qualified Data.Set as Set
39 import qualified Data.Text as Text
40
41
42 data NgramsListBuilder = BuilderStepO { stemSize :: Int
43 , stemX :: Int
44 , stopSize :: Int
45 }
46 | BuilderStep1 { withModel :: Model }
47 | BuilderStepN { withModel :: Model }
48 | Tficf { nlb_lang :: Lang
49 , nlb_group1 :: Int
50 , nlb_group2 :: Int
51 , nlb_stopSize :: StopSize
52 , nlb_userCorpusId :: UserCorpusId
53 , nlb_masterCorpusId :: MasterCorpusId
54 }
55
56
57 data StopSize = StopSize {unStopSize :: Int}
58
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
61 -> Cmd err (Map NgramsType [NgramsElement])
62 buildNgramsLists l n m s uCid mCid = do
63 ngTerms <- buildNgramsTermsList l n m s uCid mCid
64 --ngTerms <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 550 300
65 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
66 pure $ Map.unions $ othersTerms <> [ngTerms]
67
68
69 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
70 -> Cmd err (Map NgramsType [NgramsElement])
71 buildNgramsOthersList uCid groupIt nt = do
72 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
73
74 let
75 all' = Map.toList ngs
76 pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all')
77 where
78 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
79 | (t,_ns) <- x
80 ]
81 )
82 ]
83
84 --{-
85 buildNgramsTermsList' :: UserCorpusId
86 -> (Text -> Text)
87 -> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int
88 -> Cmd err (Map NgramsType [NgramsElement])
89 --}
90 buildNgramsTermsList' uCid groupIt stop gls is = do
91 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
92
93 let
94 (stops, candidates) = partitionEithers
95 $ map (\t -> if stop t then Left t else Right t)
96 $ Map.toList
97 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
98
99 (maps, candidates') = takeScored gls is
100 $ getCoocByNgrams' snd (Diagonal True)
101 $ Map.fromList candidates
102
103
104 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
105
106 (s,c,m) = (stops
107 , List.filter (\(k,_) -> List.elem k candidates') candidates
108 , List.filter (\(k,_) -> List.elem k maps) candidates
109 )
110
111 let ngs' = List.concat
112 $ map toNgramsElement
113 $ map (\t -> (StopTerm, toList' t)) s
114 <> map (\t -> (CandidateTerm, toList' t)) c
115 <> map (\t -> (GraphTerm, toList' t)) m
116
117 pure $ Map.fromList [(NgramsTerms, ngs')]
118
119
120 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
121 -> Cmd err (Map NgramsType [NgramsElement])
122 buildNgramsTermsList l n m s uCid mCid = do
123 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
124 let
125 candidatesSize = 2000
126 a = 500
127 b = 500
128 candidatesHead = List.take candidatesSize candidates
129 candidatesTail = List.drop candidatesSize candidates
130 termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
131 <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
132 let ngs = List.concat $ map toNgramsElement termList
133
134 pure $ Map.fromList [(NgramsTerms, ngs)]
135
136
137 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
138 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
139 case Set.toList setNgrams of
140 [] -> []
141 (parent:children) -> [parentElem] <> childrenElems
142 where
143 parentElem = mkNgramsElement parent
144 listType
145 Nothing
146 (mSetFromList children)
147 childrenElems = map (\t -> mkNgramsElement t listType
148 (Just $ RootParent parent parent)
149 (mSetFromList [])
150 ) children
151
152
153 toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
154 toList stop l n = case stop n of
155 True -> (StopTerm, n)
156 False -> (l, n)
157
158
159 toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
160 toTermList _ _ _ [] = []
161 toTermList a b stop ns = trace ("computing toTermList") $
162 map (toList stop CandidateTerm) xs
163 <> map (toList stop GraphTerm) ys
164 <> toTermList a b stop zs
165 where
166 xs = take a ns
167 ta = drop a ns
168
169 ys = take b ta
170 zs = drop b ta
171
172
173 isStopTerm :: StopSize -> Text -> Bool
174 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
175 where
176 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)