]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
[REFACTO] ngrams unsupervised.
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15
16 module Gargantext.Text.List
17 where
18
19 import Data.Map (Map)
20 import Data.Set (Set)
21 import Data.Text (Text)
22 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
25 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
26 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
27 import Gargantext.Database.Utils (Cmd)
28 import Gargantext.Text.List.Learn (Model(..))
29 import Gargantext.Prelude
30 --import Gargantext.Text.Terms (TermType(..))
31 import qualified Data.Char as Char
32 import qualified Data.List as List
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35 import qualified Data.Text as Text
36
37
38 data NgramsListBuilder = BuilderStepO { stemSize :: Int
39 , stemX :: Int
40 , stopSize :: Int
41 }
42 | BuilderStep1 { withModel :: Model }
43 | BuilderStepN { withModel :: Model }
44
45
46
47 data StopSize = StopSize {unStopSize :: Int}
48
49 -- | TODO improve grouping functions of Authors, Sources, Institutes..
50 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
51 -> Cmd err (Map NgramsType [NgramsElement])
52 buildNgramsLists l n m s uCid mCid = do
53 ngTerms <- buildNgramsTermsList l n m s uCid mCid
54 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
55 pure $ Map.unions $ othersTerms <> [ngTerms]
56
57
58 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
59 -> Cmd err (Map NgramsType [NgramsElement])
60 buildNgramsOthersList uCid groupIt nt = do
61 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
62
63 let
64 all' = Map.toList ngs
65 pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all')
66 where
67 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
68 | (t,_ns) <- x
69 ]
70 )
71 ]
72
73 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
74 -> Cmd err (Map NgramsType [NgramsElement])
75 buildNgramsTermsList l n m s uCid mCid = do
76 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
77 let termList = toTermList ((isStopTerm s) . fst) candidates
78 let ngs = List.concat $ map toNgramsElement termList
79
80 pure $ Map.fromList [(NgramsTerms, ngs)]
81
82
83 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
84 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
85 case Set.toList setNgrams of
86 [] -> []
87 (parent:children) -> [parentElem] <> childrenElems
88 where
89 parentElem = mkNgramsElement parent
90 listType
91 Nothing
92 (mSetFromList children)
93 childrenElems = map (\t -> mkNgramsElement t listType
94 (Just $ RootParent parent parent)
95 (mSetFromList [])
96 ) children
97
98 -- TODO remove hard coded parameters
99 toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
100 toTermList stop ns = map (toTermList' stop CandidateTerm) xs
101 <> map (toTermList' stop GraphTerm) ys
102 <> map (toTermList' stop CandidateTerm) zs
103 where
104 toTermList' stop' l n = case stop' n of
105 True -> (StopTerm, n)
106 False -> (l, n)
107
108 -- TODO use % of size of list
109 -- TODO user ML
110 xs = take a ns
111 ys = take b $ drop a ns
112 zs = drop b $ drop a ns
113
114 a = 3
115 b = 500
116
117 isStopTerm :: StopSize -> Text -> Bool
118 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
119 where
120 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)