From 552c47d28e1b53ba92b16407f4b888593ad2d233 Mon Sep 17 00:00:00 2001
From: Radek Mrvec <radek.mrvec@sumanet.cz>
Date: Tue, 20 Apr 2021 23:17:36 +0200
Subject: [PATCH 1/6] Natazeni souboru dat z json souboru refs #8331 @2h

---
 .gitignore                                    |  5 ++
 Indexer/Indexer.sln                           | 14 ++++-
 Indexer/Indexer/Indexer.csproj                | 16 +++++
 Indexer/Indexer/Preprocessor.cs               | 13 ++++
 Indexer/Indexer/Program.cs                    | 60 +++++++++++++++++++
 Indexer/Models/Models.csproj                  |  7 +++
 .../Models/BlogEntriesRepository.cs           |  0
 .../Models/BlogEntry.cs                       |  0
 .../Models/IRepository.cs                     |  0
 Indexer/WebCrawler/WebCrawler.csproj          |  4 ++
 10 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 Indexer/Indexer/Indexer.csproj
 create mode 100644 Indexer/Indexer/Preprocessor.cs
 create mode 100644 Indexer/Indexer/Program.cs
 create mode 100644 Indexer/Models/Models.csproj
 rename Indexer/{WebCrawler => Models}/Models/BlogEntriesRepository.cs (100%)
 rename Indexer/{WebCrawler => Models}/Models/BlogEntry.cs (100%)
 rename Indexer/{WebCrawler => Models}/Models/IRepository.cs (100%)

diff --git a/.gitignore b/.gitignore
index 0df7638..6829651 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,8 @@
 /Indexer/WebCrawler/obj
 /Indexer/WebCrawler/bin/Debug/net5.0
 /.vs
+/Indexer/Indexer/bin/Debug/net5.0
+/Indexer/Indexer/obj/Debug/net5.0
+/Indexer/Indexer/obj
+/Indexer/Models/bin/Debug/net5.0
+/Indexer/Models/obj
diff --git a/Indexer/Indexer.sln b/Indexer/Indexer.sln
index d76b612..e33cac9 100644
--- a/Indexer/Indexer.sln
+++ b/Indexer/Indexer.sln
@@ -3,7 +3,11 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 16
 VisualStudioVersion = 16.0.31112.23
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WebCrawler", "WebCrawler\WebCrawler.csproj", "{D0ED9338-791A-428D-AC37-E41210B6DAF4}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "WebCrawler", "WebCrawler\WebCrawler.csproj", "{D0ED9338-791A-428D-AC37-E41210B6DAF4}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer", "Indexer\Indexer.csproj", "{B597653B-2773-48B2-BF4A-29D150450AD9}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Models", "Models\Models.csproj", "{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -15,6 +19,14 @@ Global
 		{D0ED9338-791A-428D-AC37-E41210B6DAF4}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{D0ED9338-791A-428D-AC37-E41210B6DAF4}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{D0ED9338-791A-428D-AC37-E41210B6DAF4}.Release|Any CPU.Build.0 = Release|Any CPU
+		{B597653B-2773-48B2-BF4A-29D150450AD9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{B597653B-2773-48B2-BF4A-29D150450AD9}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{B597653B-2773-48B2-BF4A-29D150450AD9}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{B597653B-2773-48B2-BF4A-29D150450AD9}.Release|Any CPU.Build.0 = Release|Any CPU
+		{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/Indexer/Indexer/Indexer.csproj b/Indexer/Indexer/Indexer.csproj
new file mode 100644
index 0000000..1060521
--- /dev/null
+++ b/Indexer/Indexer/Indexer.csproj
@@ -0,0 +1,16 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net5.0</TargetFramework>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Models\Models.csproj" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <Folder Include="Utils\" />
+  </ItemGroup>
+
+</Project>
diff --git a/Indexer/Indexer/Preprocessor.cs b/Indexer/Indexer/Preprocessor.cs
new file mode 100644
index 0000000..196b376
--- /dev/null
+++ b/Indexer/Indexer/Preprocessor.cs
@@ -0,0 +1,13 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer
+{
+    public class Preprocessor
+    {
+
+    }
+}
diff --git a/Indexer/Indexer/Program.cs b/Indexer/Indexer/Program.cs
new file mode 100644
index 0000000..922505f
--- /dev/null
+++ b/Indexer/Indexer/Program.cs
@@ -0,0 +1,60 @@
+
+using System;
+using System.IO;
+using System.Text.Json;
+using WebCrawler.Models;
+
+namespace Indexer
+{
+    class Program
+    {
+        /// <summary>Application entry point; forwards the command-line arguments to <see cref="Run"/>.</summary>
+        static void Main(string[] args)
+        {
+            new Program().Run(args);
+        }
+
+        /// <summary>Handles every argument as a single file or as a directory tree.</summary>
+        /// <param name="args">File or directory paths; any other value is reported on the console.</param>
+        public void Run(string[] args)
+        {
+            foreach (string candidate in args)
+            {
+                if (File.Exists(candidate))
+                {
+                    ProcessFile(candidate);
+                    continue;
+                }
+                if (!Directory.Exists(candidate))
+                {
+                    Console.WriteLine("{0} is not a valid file or directory.", candidate);
+                    continue;
+                }
+                ProcessDirectory(candidate);
+            }
+        }
+
+        /// <summary>Processes the files directly inside a directory, then recurses into its subdirectories.</summary>
+        /// <param name="targetDirectory">Directory whose content is processed.</param>
+        public static void ProcessDirectory(string targetDirectory)
+        {
+            foreach (string file in Directory.GetFiles(targetDirectory))
+            {
+                ProcessFile(file);
+            }
+
+            foreach (string child in Directory.GetDirectories(targetDirectory))
+            {
+                ProcessDirectory(child);
+            }
+        }
+
+        // Reads one file, deserializes its JSON content into a BlogEntry and logs that it was processed.
+        public static void ProcessFile(string path)
+        {
+            string contents = File.ReadAllText(path);
+            BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents);
+            Console.WriteLine("Processed file '{0}'.", path);
+        }
+    }
+}
diff --git a/Indexer/Models/Models.csproj b/Indexer/Models/Models.csproj
new file mode 100644
index 0000000..563e6f9
--- /dev/null
+++ b/Indexer/Models/Models.csproj
@@ -0,0 +1,7 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>net5.0</TargetFramework>
+  </PropertyGroup>
+
+</Project>
diff --git a/Indexer/WebCrawler/Models/BlogEntriesRepository.cs b/Indexer/Models/Models/BlogEntriesRepository.cs
similarity index 100%
rename from Indexer/WebCrawler/Models/BlogEntriesRepository.cs
rename to Indexer/Models/Models/BlogEntriesRepository.cs
diff --git a/Indexer/WebCrawler/Models/BlogEntry.cs b/Indexer/Models/Models/BlogEntry.cs
similarity index 100%
rename from Indexer/WebCrawler/Models/BlogEntry.cs
rename to Indexer/Models/Models/BlogEntry.cs
diff --git a/Indexer/WebCrawler/Models/IRepository.cs b/Indexer/Models/Models/IRepository.cs
similarity index 100%
rename from Indexer/WebCrawler/Models/IRepository.cs
rename to Indexer/Models/Models/IRepository.cs
diff --git a/Indexer/WebCrawler/WebCrawler.csproj b/Indexer/WebCrawler/WebCrawler.csproj
index 8f52e34..324450c 100644
--- a/Indexer/WebCrawler/WebCrawler.csproj
+++ b/Indexer/WebCrawler/WebCrawler.csproj
@@ -10,4 +10,8 @@
     <PackageReference Include="System.Net.Http" Version="4.3.4" />
   </ItemGroup>
 
+  <ItemGroup>
+    <ProjectReference Include="..\Models\Models.csproj" />
+  </ItemGroup>
+
 </Project>
-- 
GitLab


From afae65953163dbd91b446c8e68d03bd824c1e047 Mon Sep 17 00:00:00 2001
From: Radek Mrvec <radek.mrvec@sumanet.cz>
Date: Thu, 22 Apr 2021 20:19:54 +0200
Subject: [PATCH 2/6] fixed #8698 escaping in JSON file and also added
 downloading of whole HTML page f

---
 .gitignore | 376 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 363 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6829651..7d8da62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,13 +1,363 @@
-################################################################################
-# This .gitignore file was automatically created by Microsoft(R) Visual Studio.
-################################################################################
-
-/Indexer/.vs/Indexer/v16
-/Indexer/WebCrawler/obj
-/Indexer/WebCrawler/bin/Debug/net5.0
-/.vs
-/Indexer/Indexer/bin/Debug/net5.0
-/Indexer/Indexer/obj/Debug/net5.0
-/Indexer/Indexer/obj
-/Indexer/Models/bin/Debug/net5.0
-/Indexer/Models/obj
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+##
+## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
+
+# User-specific files
+*.rsuser
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Mono auto generated files
+mono_crash.*
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+[Ww][Ii][Nn]32/
+[Aa][Rr][Mm]/
+[Aa][Rr][Mm]64/
+bld/
+[Bb]in/
+[Oo]bj/
+[Oo]ut/
+[Ll]og/
+[Ll]ogs/
+
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# Visual Studio 2017 auto generated files
+Generated\ Files/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUnit
+*.VisualState.xml
+TestResult.xml
+nunit-*.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# Benchmark Results
+BenchmarkDotNet.Artifacts/
+
+# .NET Core
+project.lock.json
+project.fragment.lock.json
+artifacts/
+
+# ASP.NET Scaffolding
+ScaffoldingReadMe.txt
+
+# StyleCop
+StyleCopReport.xml
+
+# Files built by Visual Studio
+*_i.c
+*_p.c
+*_h.h
+*.ilk
+*.meta
+*.obj
+*.iobj
+*.pch
+*.pdb
+*.ipdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*_wpftmp.csproj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+
+# Visual Studio Trace Files
+*.e2e
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# AxoCover is a Code Coverage Tool
+.axoCover/*
+!.axoCover/settings.json
+
+# Coverlet is a free, cross platform Code Coverage Tool
+coverage*.json
+coverage*.xml
+coverage*.info
+
+# Visual Studio code coverage results
+*.coverage
+*.coveragexml
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# Note: Comment the next line if you want to checkin your web deploy settings,
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+
+# Microsoft Azure Web App publish settings. Comment the next line if you want to
+# checkin your Azure Web App publish settings, but sensitive information contained
+# in these scripts will be unencrypted
+PublishScripts/
+
+# NuGet Packages
+*.nupkg
+# NuGet Symbol Packages
+*.snupkg
+# The packages folder can be ignored because of Package Restore
+**/[Pp]ackages/*
+# except build/, which is used as an MSBuild target.
+!**/[Pp]ackages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/[Pp]ackages/repositories.config
+# NuGet v3's project.json files produces more ignorable files
+*.nuget.props
+*.nuget.targets
+
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+
+# Microsoft Azure Emulator
+ecf/
+rcf/
+
+# Windows Store app package directories and files
+AppPackages/
+BundleArtifacts/
+Package.StoreAssociation.xml
+_pkginfo.txt
+*.appx
+*.appxbundle
+*.appxupload
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!?*.[Cc]ache/
+
+# Others
+ClientBin/
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.jfm
+*.pfx
+*.publishsettings
+orleans.codegen.cs
+
+# Including strong name files can present a security risk
+# (https://github.com/github/gitignore/pull/2483#issue-259490424)
+#*.snk
+
+# Since there are multiple workflows, uncomment next line to ignore bower_components
+# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+#bower_components/
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+ServiceFabricBackup/
+*.rptproj.bak
+
+# SQL Server files
+*.mdf
+*.ldf
+*.ndf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+*.rptproj.rsuser
+*- [Bb]ackup.rdl
+*- [Bb]ackup ([0-9]).rdl
+*- [Bb]ackup ([0-9][0-9]).rdl
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+node_modules/
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+*.vbw
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+# Paket dependency manager
+.paket/paket.exe
+paket-files/
+
+# FAKE - F# Make
+.fake/
+
+# CodeRush personal settings
+.cr/personal
+
+# Python Tools for Visual Studio (PTVS)
+__pycache__/
+*.pyc
+
+# Cake - Uncomment if you are using it
+# tools/**
+# !tools/packages.config
+
+# Tabs Studio
+*.tss
+
+# Telerik's JustMock configuration file
+*.jmconfig
+
+# BizTalk build output
+*.btp.cs
+*.btm.cs
+*.odx.cs
+*.xsd.cs
+
+# OpenCover UI analysis results
+OpenCover/
+
+# Azure Stream Analytics local run output
+ASALocalRun/
+
+# MSBuild Binary and Structured Log
+*.binlog
+
+# NVidia Nsight GPU debugger configuration file
+*.nvuser
+
+# MFractors (Xamarin productivity tool) working folder
+.mfractor/
+
+# Local History for Visual Studio
+.localhistory/
+
+# BeatPulse healthcheck temp database
+healthchecksdb
+
+# Backup folder for Package Reference Convert tool in Visual Studio 2017
+MigrationBackup/
+
+# Ionide (cross platform F# VS Code tools) working folder
+.ionide/
+
+# Fody - auto-generated XML schema
+FodyWeavers.xsd
\ No newline at end of file
-- 
GitLab


From add9dae90cf211ed9057ef9f0ce19d01b23e9f52 Mon Sep 17 00:00:00 2001
From: Radek Mrvec <radek.mrvec@sumanet.cz>
Date: Sat, 24 Apr 2021 22:54:09 +0200
Subject: [PATCH 3/6] refs #8331 finished preprocessor with stemmer.

---
 .gitignore                                    |   2 +-
 .../.vs/Indexer/DesignTimeBuild/.dtbcache.v2  | Bin 60901 -> 78512 bytes
 Indexer/Indexer.sln                           |   6 +-
 Indexer/Indexer/Indexer.Indexer.csproj        |  18 ++++
 Indexer/Indexer/Indexer.csproj                |  20 +---
 Indexer/Indexer/Program.cs                    |  11 ++-
 .../Indexer/Properties/launchSettings.json    |   8 ++
 Indexer/Indexer/Utils/Preprocessor.cs         |  90 ++++++++++++++++++
 .../Enums/IdxLanguage.cs}                     |   7 +-
 .../{Models.csproj => Indexer.Models.csproj}  |   2 +-
 .../Models/Models/BlogEntriesRepository.cs    |   2 +-
 Indexer/Models/Models/TextData.cs             |  13 +++
 Indexer/Models/Models/TextTokens.cs           |  13 +++
 Indexer/Models/{Models => Repos}/BlogEntry.cs |   2 +-
 .../Models/{Models => Repos}/IRepository.cs   |   2 +-
 Indexer/WebCrawler/GamasutraCrawl.cs          |  55 +++++++----
 Indexer/WebCrawler/GamasutraParser.cs         |   4 +-
 ...awler.csproj => Indexer.WebCrawler.csproj} |   5 +-
 Indexer/WebCrawler/Program.cs                 |  28 ++++--
 Indexer/WebCrawler/Utils/HtmlDownloader.cs    |   2 +-
 20 files changed, 232 insertions(+), 58 deletions(-)
 create mode 100644 Indexer/Indexer/Indexer.Indexer.csproj
 create mode 100644 Indexer/Indexer/Properties/launchSettings.json
 create mode 100644 Indexer/Indexer/Utils/Preprocessor.cs
 rename Indexer/{Indexer/Preprocessor.cs => Models/Enums/IdxLanguage.cs} (56%)
 rename Indexer/Models/{Models.csproj => Indexer.Models.csproj} (68%)
 create mode 100644 Indexer/Models/Models/TextData.cs
 create mode 100644 Indexer/Models/Models/TextTokens.cs
 rename Indexer/Models/{Models => Repos}/BlogEntry.cs (86%)
 rename Indexer/Models/{Models => Repos}/IRepository.cs (86%)
 rename Indexer/WebCrawler/{WebCrawler.csproj => Indexer.WebCrawler.csproj} (53%)

diff --git a/.gitignore b/.gitignore
index 7d8da62..43d9deb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -360,4 +360,4 @@ MigrationBackup/
 .ionide/
 
 # Fody - auto-generated XML schema
-FodyWeavers.xsd
\ No newline at end of file
+FodyWeavers.xsd
diff --git a/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 b/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2
index 157bdf0469ee702a9156b45762c5d6311cdcb60d..c912c6df6fdd87ce37466d9e2240b2a99f4a3a71 100644
GIT binary patch
delta 7501
zcmeH~e{2+G8pmg*R74Mo{IDWW1nd<KX?J(JZC4IyEu}0_OA7^o7Tlektqa@TvOlb~
z;FKcAaq7V`aEczMoH51_&odk2dFmPC8Bjvd7-Nj_j4{MhLwM$%7|*EjzVCDwN^xZ`
zx&C!*`h7p&d4J6NzVp8Gyq(h@7X112DEW^mxw*DF7<5+#oq<42mEYs6^!VJWLv`rE
zfV)!FG>=QweXbhKU+Jn0=s}O>*WFbf)urif-J@%6w<opldU;FgsN?6}NI<_^kG1<l
zk@m&9KiSzH(G%`+M|-BA+-B;BYsbwRo^w+{{mt#t^b4}}Sa~cNNrbv|EgA`iI;Fz$
z&GBeN@}jS)nVJ{(B`rz)X?~$yZ2m*8kzHo7(qX4Jm<N>-xgD)5v#q24%|9_x3?L8=
zOQRC8q%Mhe>$B~CdWM;H6En33r=7_x8kQ-IvirDTP8W@8$ehx}qjt)&YTh<5$9%o8
z46b`}X?pw>(~|OfjIg088qmY>iyuq2+re_X&xQ;0Zn|NO9*c*fk%dlk-_PchMv|EO
zjPmmK%q=ghU9QEuW6{m#yuQFysx;bJ?sS%`s(GkyiL9pUw_%Jf^VoI+Y2&@G$Qv;<
zN3Nm8^x(N2GGut0(Z-W$bI}v|(ZQ70N4tCLqg~x$Jz*-wd&Z49+hyB}^}Teh?tovq
z@sevUYL&F=@o4xi9iwQ~(~bRCD1&Pyhs>gZ>r#K2{fqpcLVD{!TZ1yU!nBS}o}3+U
zmSW%z^USg7W3xFn%?Za>$?MJF@tMQ&_a66SE&g@<KBPTwK47oQ>Y`av(aM;Ph1B$T
zJQ_1Ud#h!J<EI~**?MAbeXU=uL6mSgD%~!<%I8vjZVe&A=?>`5V3itlR%xDq8t_;8
z5L^8C)i~W%)xJufrn@RDRn;Sj&AKwr)fID{ke%j`9Ww61JZUNJd3~;XN)3n$M$XdN
zIcGhj!%J2YTVQt1jnWjoTD+X)qShRHStngM1N*hC5UX1HBLzhj316&JPuv_1`D4CV
zue46A^RS0trjE}u54<(gn9!<sN8|0chNGPgkwmN)FRV0aFwbivl!y-IR$OZ8b@8~~
z<q!9IBf+S+O|9;5)l|AP4^9Kk6?E%0T8+P2^97wXs^->%j+&r4!(Uykx;3{Jbh~_-
z!{HA215TGaSm~(pc-+#~)X^1JX6Kmtx@63`3FBhs4cU^ly1zc=>j@7(3e0u#BeFkr
zbb)zRx#qebsV%k)%2zDTQFd*P{Bzsve9zJf(a))Z4x=pRgF0W^Uc$U8id|5abFU1!
zqo{aOMKo#hv5C3ns)|jMtFS}tlXFko+<n|_u79gcEpAJMa8xW!bcGj3dm`beFM#Ka
z^;L7u(_D69YP#=4sdCHcm5BbjmXM~2pt~Zf`NFGw2|XMRCG^aG3F&dExK$78F+HN`
z&B;V}GLb1VUp`rDK7O*q{O;s7`8K?^*%d!`5M<Y9@R$*6;3&zwyxF46i<zmh8mnYv
zi-tVC!Q!#f;BOqDE0Pq1I-xtE5VRTE0)?S2C;~;HZfGkMgT|w692D&n(8c#KE`5_J
zxJeR??}B=uyP;l4-0&W#584K8hwg<8s2|z^4M01g`=DKr804>@-O#V0J)-h&p!=ce
znT{TS7o9x_4eju^xE7seJIlADy=b!!N<sS}yJJxFYlx1EG98=nqJsmF=tPWuc;h|D
z%MSP;{O=$c?L`NI4}k@sXe(wb>Wk}#GuMZ}D{%b?h`w5-M`1+e-$SDE7!-<u<QhbF
z0H`7zM)C(pbnuwSz^^?$@b=#)87XVEyqN8u)h5#fYmGd3g%m~SJX>q!{qjERcBCn5
zo!lTwto4HxNMUP(%vQT>b=c(_<t5f8*}4N44_iLD(elfd1_WeF2ZFNI3EU}LMhM(2
zTU&szY;^$<*@^<)vb7b>_ggV}DI>wq;=;v+OQ2+*m9*2lkbd_Fm3Wy-*y@qJ*4;9I
z!P6|{&0;DAN}0-lGN!q}T&8)zJT_=PIG?Eks9<seP9_ylncRSzsT!ze@&F#D1;7HP
zg}_3lMZhAaI-riJ9;jz(02-JYfkq}TfWPHJOie%&Q!~)aWVC=SOsjxZOl?3L(^_CH
z(>h=s(*|GzQwPw&v<cY6<O6(68lW-hfX>tjbTWm25YrZ53sV=+#S{giOk07iOmQI2
zlr+F3QxDL?)C=@7^#OfM+kx#EB49A>0Cq6#1a>m*0(LR&26i*;0roK658Ti6An+j5
zUSKa%3P>@TfXQ?aILP!6@DS4>FK&E@=}}xh%5)ew%=8%W7}F8p2-6e56HHG6Pcl6P
zJjL`h@HEp=;3(5Gz%xwG0?#r%2Rz4=2GUH=1J5(P0KCBTBJd*9OTbG^uXwTkuQ0ue
z%U79R172f#9eADT4d4x?W56+{H-R^qP5>vEP68*HP64Nw-T~fWdKY+?={?{*ruTvO
znLYqMVEPdFkm)1fBc_jmkC{FJj8B+81wUmv4V-5B4ET)cbKrBPFM%(a&H!ha&H`td
zEWl!7fSJAmzGC_s_?qb(;2WlIfp3|<2fk<e0r-I}S@w#AtpZ^RU<wW43gJczGa6=$
zFk@h@7UpW0vBHdnDH5g#<{DwHfte`GM3~9KOok~IrWj_5FjHWr3NsaEx-ipWW(YF_
zW~OYw&4imRvS!1S3R4PGCQKR3Tw&(I%oAoF%zR<y!&C@U0pk?L38M<5!nlQT!&D1X
z4dW5U1G7Mw1uzR`>_4PzEfQIaVCsaagQ*v$9;QK<2AD=+8ezP`cwv?cvmB;Lm?oHJ
zVVYrDglU24MRZD88|7t;#JUGLeIkHuvqRT*`In+}gS`BFEM@CnlpUanE0AZ%BJH=+
z9d<fkH{6NzyT7K-e(_IbZGZpcnUMasX_7?J0Li3Kfpv@W?+H{W?!QR66<w~DVXRsO
zDP3vRDRixMt76q7bR%O5T~AYKJWWGfrRli&!|74)ES_YX5BmH5+tB{Mi0p<KNgiQ*
zq@uWyXBcn63)V6MQQQdS7sk<jawMWS!gF3>e59hdk!KhmsVHvb8OBE{iX%Mc6~;xh
z8Hp%v<Qc}%{fI<yBhN5CQc=7u&oD0j8@0nWqO6G6ixrVJ*y$2G9TK-+G|ztFOwakU
zTN%@+)LKg+uX*ImL}M9T6;ut?K+B;OP!qHOx(!;W)b{ru$pri!d<vZp_|2Jszu#J^
z)InB@f)ug7RiUe_Rf@G5u^t&?X&e>Nc&p7m7S<>*)>@<!MBUrPxv);L)}wBll0Nlz
z&*g7~kvd)SJX|h+BV7JQxcrUq|ND)wFxNMNwZ=YLHrQ#qopuaAY+lAk!qZCOMjRcR
d%)@7j=idQ0^pNr4+7I0fX^=Q*0+4;s{0EVYm+}Ar

delta 2721
zcmai!4NTN!9LMiDK+{U|g?uS6Ul<YN<=_rHnE?U<0TCgQatL?a9|R5#xg#Rc!_2kj
zTHsSF=bCG+HP%|$t-03JxoXXP3$d~0T6501)>^-3zd3Wget)OP=CW@4{yv}Q|GfPl
z{_OeqO8k|JacXEx)fVMB);ikra@<yq<@D$IvVHB&{5;F)bhuqUcb?bfvMg_QZoWUq
znUm+U{EqDQ+-$$s=XSTX``0?0YsW^Ht4{5mDA<2~fo}aaStUuG6<4y(V_OPcs<*0p
zRQ2?T#S_)~hFYy0I?3J~tB>e|_GGmM1F9cvw0tG~-ob#?pXm<-Z1Lg#0m~*`vU<)P
zz4Pv9&z))8;$!n=*J{0ZVx1m6zc9KkzE@$#@q}8`6SGpH$+I3F(0}b{o9nR6c4Rsn
znYj+#J-ji;VN0wZ3Wcqn%qOjIW<@yMm*vdN&2-ulZeso&y|x^^Yd9$}baPHG=4`_p
zPi%1Xli@wGYm)sD-E^uzkL_A>yR}B&clrT6H`1vKM<zWhg1#<KspT8!@L&m7X0~VM
zJh9XK=YT%H&$hzX+57J+M0f69ZjYV)QPrg{jW_7p*>j`&UZT>}HCe%d@VfuoJ6#_>
zn{?0g3+aiI^{PRio?NM~O}g~RH?G*3mg=+L1a6;h#^js}^<x)O7cQs>_XJ9VgS~;E
z*N>0f7Sh${R^=qs1pQVZ<k=7ic0A?nutHB;eZf#?IM_dgA-2R@-IX(sb^7VL<>_~Q
zYklx^TJ+F`6uaIqm8|2Z9@ni?!#Dp4Bh!b7ZQs`)?7Gi(^EVGDw%cqQkP@U6DML0Q
z<wylmi9CT+A=O9?@+9&A=GKBT|0(3|Yeeq8$TBQahsjSP^~ffq0g(ltK^l=~k<G{!
zqzUmL&B#`y1!+aRh`gi^X+!*og|s6b5tRA2HrfW4#XAvMSQhC*x{(0VgWR47??r-$
z!h9Jc>;vOKnJb&hcxi9H)$Rvpp&bHcuevab3?4va@Eml?i`?xcre6R<Y|o(#BC_%G
z@&LcqX#B-57e_|TCbeIUng-PU<{4GW_?bo}V6wA$R<YTv%$8d{O*1_n^fW886*I?7
ziz+j%%6QRu%lOnr)256cu#{;BI+WQ4bSl#YbSo18dX$Ouf<a~afbGik$KE2O%Grbz
zEi_@Vu-E{4M$L1vI*9tm0ZOi5N@iLGtYS(5QkYVKRHif_jVT>SXUYIFn6iK@CI{eP
z$^~*GOfJyHln>-HxdAs*0Z_oS9$3#*2oy3E1I0`wKnYVBP{vdalrvQVl}uGY6;lmR
z!&D2@GSvZfO!YuLQv=Yz)Ce?kgf@elnVNtmre>g-sRd|Z@&aC_HlU5k0xYHupo6Is
z=w#{!x|w=_9;P4=WZDjFX9@u!rU77pX%HA>8Uluxb^tq=B0IsIAb`KcZeTal9$*jC
zUSKcN%fQP_uK=$w?F05Py$Za_^cwIQ(-<(uqyf$J2Ji;co4}h)2Y>@i2Z4i3Zv$^L
z9Rdz9y(9a+!}KojF4KF!dra>G?=y`9<4hj_A259ge8}_>@DbBt;4o7Zh%$W&e9CkL
zIKuQ9@EOzRz~@Xyful^vf#XakfD=q#0ADbDRe>{q#q>3rUo)KqPBNVWPBEPU&M=(=
z&M{2@6HJr9B-3}mcT7{j6w~*>_e>Xoi%dTNKQR3W{K#|(xWsfBxXg3~xWaT5xXScX
z1-}1JOh2RfGt)1?FHFAzzcLxXFfqVP*MMtG*MaLyH-H;VzXQKB{R#ZZ^f&M~8>K3w
zu!$3igCvL~KxT`~hRhL}1GzUMb}#IHk^3R@Mdm{mh%A6C5?KUE6iI|E5m^FRCbA6j
zkjO)jB#|V@3Xv6%l_D!4t3_5rQbbZ9sS4j8l}(yFN`s_}q(d@9G9XzZSrCVa1ClF}
z3vr3KAo(Ks5VwdMQXo<QSue64QYcagDHbV)lqh_ER5oSus0>mrQVyvUsf1LCR6%M)
zY9O^DwU9cII>__5kmL%iWE3-mCojmIwIg<04WsVIHTuNNy~@TDYc?q-Zdv@yP7HdT
x7F9uaDXEcI9f{T5v2|WV{o|O~ql%6k%TnWZz3R8ZdFF9@ktsxq>_sCZ{{UtD+!O!+

diff --git a/Indexer/Indexer.sln b/Indexer/Indexer.sln
index e33cac9..1973e44 100644
--- a/Indexer/Indexer.sln
+++ b/Indexer/Indexer.sln
@@ -3,11 +3,11 @@ Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio Version 16
 VisualStudioVersion = 16.0.31112.23
 MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "WebCrawler", "WebCrawler\WebCrawler.csproj", "{D0ED9338-791A-428D-AC37-E41210B6DAF4}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer.WebCrawler", "WebCrawler\Indexer.WebCrawler.csproj", "{D0ED9338-791A-428D-AC37-E41210B6DAF4}"
 EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer", "Indexer\Indexer.csproj", "{B597653B-2773-48B2-BF4A-29D150450AD9}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer.Indexer", "Indexer\Indexer.Indexer.csproj", "{B597653B-2773-48B2-BF4A-29D150450AD9}"
 EndProject
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Models", "Models\Models.csproj", "{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}"
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer.Models", "Models\Indexer.Models.csproj", "{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
diff --git a/Indexer/Indexer/Indexer.Indexer.csproj b/Indexer/Indexer/Indexer.Indexer.csproj
new file mode 100644
index 0000000..4b89db9
--- /dev/null
+++ b/Indexer/Indexer/Indexer.Indexer.csproj
@@ -0,0 +1,18 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net5.0</TargetFramework>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Models\Indexer.Models.csproj" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <PackageReference Include="HtmlAgilityPack" Version="1.11.33" />
+    <PackageReference Include="Microsoft.ML" Version="1.5.5" />
+    <PackageReference Include="StemmersNet.Standard" Version="1.1.1" />
+  </ItemGroup>
+
+</Project>
diff --git a/Indexer/Indexer/Indexer.csproj b/Indexer/Indexer/Indexer.csproj
index 1060521..0f14913 100644
--- a/Indexer/Indexer/Indexer.csproj
+++ b/Indexer/Indexer/Indexer.csproj
@@ -1,16 +1,4 @@
-<Project Sdk="Microsoft.NET.Sdk">
-
-  <PropertyGroup>
-    <OutputType>Exe</OutputType>
-    <TargetFramework>net5.0</TargetFramework>
-  </PropertyGroup>
-
-  <ItemGroup>
-    <ProjectReference Include="..\Models\Models.csproj" />
-  </ItemGroup>
-
-  <ItemGroup>
-    <Folder Include="Utils\" />
-  </ItemGroup>
-
-</Project>
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup />
+</Project>
\ No newline at end of file
diff --git a/Indexer/Indexer/Program.cs b/Indexer/Indexer/Program.cs
index 922505f..4e11e51 100644
--- a/Indexer/Indexer/Program.cs
+++ b/Indexer/Indexer/Program.cs
@@ -1,10 +1,11 @@
 
+using Indexer.Models;
 using System;
+using System.Collections.Generic;
 using System.IO;
 using System.Text.Json;
-using WebCrawler.Models;
 
-namespace Indexer
+namespace Indexer.Indexer
 {
     class Program
     {
@@ -54,7 +55,11 @@ namespace Indexer
         {
             string contents = File.ReadAllText(path);
             BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents);
-            Console.WriteLine("Processed file '{0}'.", path);
+
+            Preprocessor processor = new();
+            HashSet<string> tokens = processor.Process(entry);
+
+            Console.WriteLine("Processed file '{0}'. Number of tokens {1}", path, tokens.Count);
         }
     }
 }
diff --git a/Indexer/Indexer/Properties/launchSettings.json b/Indexer/Indexer/Properties/launchSettings.json
new file mode 100644
index 0000000..2d62df7
--- /dev/null
+++ b/Indexer/Indexer/Properties/launchSettings.json
@@ -0,0 +1,8 @@
+{
+  "profiles": {
+    "Indexer.Indexer": {
+      "commandName": "Project",
+      "commandLineArgs": "downloaded_json"
+    }
+  }
+}
\ No newline at end of file
diff --git a/Indexer/Indexer/Utils/Preprocessor.cs b/Indexer/Indexer/Utils/Preprocessor.cs
new file mode 100644
index 0000000..3ccff14
--- /dev/null
+++ b/Indexer/Indexer/Utils/Preprocessor.cs
@@ -0,0 +1,90 @@
+using Indexer.Models;
+using Indexer.Models.Enums;
+using Indexer.Models.Models;
+using Iveonik.Stemmers;
+using Microsoft.ML;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using System.Threading.Tasks;
+using System.Web;
+
+namespace Indexer.Indexer
+{
+    public class Preprocessor
+    {
+        /// <summary>Turns the content of a blog entry into a set of distinct English word stems.</summary>
+        /// <param name="entry">Entry whose content is cleaned, tokenized and stemmed.</param>
+        /// <returns>Distinct stems; an empty set when the entry has no content.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="entry"/> is null.</exception>
+        public HashSet<string> Process(BlogEntry entry)
+        {
+            if (entry is null) throw new ArgumentNullException(nameof(entry));
+
+            string content = StripUnicodeCharactersFromString(RemoveTags(entry.Content));
+            // Nothing to tokenize: skip the ML pipeline altogether.
+            if (content.Length == 0) return new HashSet<string>();
+
+            string[] tokens = CreateTokens(content, IdxLanguage.English);
+            return StemmTokens(new EnglishStemmer(), tokens);
+        }
+
+        /// <summary>Removes HTML markup from the given string and decodes HTML entities.</summary>
+        /// <param name="input">HTML fragment; may be null or empty.</param>
+        /// <returns>Decoded inner text, or an empty string when there is no input.</returns>
+        private string RemoveTags(string input)
+        {
+            // Never return null: the result is handed to Regex.Replace, which rejects a null input.
+            if (string.IsNullOrEmpty(input)) return string.Empty;
+
+            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
+            doc.LoadHtml(input);
+
+            return HttpUtility.HtmlDecode(doc.DocumentNode.InnerText);
+        }
+
+        /// <summary>Turns tabs and line breaks into spaces and removes the remaining unwanted characters.</summary>
+        /// <param name="input">Text to clean; must not be null.</param>
+        /// <returns>Text without characters in U+0000-U+001F, U+0080-U+009F and U+0100-U+FFFF.</returns>
+        private string StripUnicodeCharactersFromString(string input)
+        {
+            // Tabs and line breaks (U+0009-U+000D) separate words; deleting them outright would glue neighbours together.
+            string spaced = Regex.Replace(input, @"[\u0009-\u000D]+", " ");
+            return Regex.Replace(spaced, @"[\u0000-\u001F\u0080-\u009f\u0100-\uFFFF]", String.Empty);
+        }
+
+        /// <summary>Splits text into word tokens and removes the default stop words of the given language.</summary>
+        /// <param name="input">Plain text to tokenize.</param>
+        /// <param name="lang">Language whose default stop-word list is applied.</param>
+        /// <returns>Tokens left after stop-word removal.</returns>
+        private string[] CreateTokens(string input, IdxLanguage lang)
+        {
+            // Map the requested language to ML.NET's stop-word list; English is the fallback for unmapped values.
+            var stopWords = lang == IdxLanguage.Czech
+                ? Microsoft.ML.Transforms.Text.StopWordsRemovingEstimator.Language.Czech
+                : Microsoft.ML.Transforms.Text.StopWordsRemovingEstimator.Language.English;
+
+            var context = new MLContext();
+            // The pipeline is fitted on an empty data set; the text itself is only supplied at prediction time.
+            var data = context.Data.LoadFromEnumerable(new List<TextData>());
+            char[] separators = { ' ', '.', ',', ')', '(', '[', ']' };
+            var tokenization = context.Transforms.Text.TokenizeIntoWords("Tokens", "Text", separators: separators)
+                .Append(context.Transforms.Text.RemoveDefaultStopWords("Tokens", "Tokens", stopWords));
+
+            var stopWordsModel = tokenization.Fit(data);
+            var engine = context.Model.CreatePredictionEngine<TextData, TextTokens>(stopWordsModel);
+            return engine.Predict(new TextData { Text = input }).Tokens;
+        }
+
+        /// <summary>Stems every token and collects the distinct results.</summary>
+        /// <param name="stemmer">Stemmer applied to each token.</param>
+        /// <param name="words">Tokens to stem.</param>
+        /// <returns>Set of distinct stems.</returns>
+        private HashSet<string> StemmTokens(IStemmer stemmer, params string[] words)
+        {
+            return new HashSet<string>(words.Select(word => stemmer.Stem(word)));
+        }
+    }
+}
diff --git a/Indexer/Indexer/Preprocessor.cs b/Indexer/Models/Enums/IdxLanguage.cs
similarity index 56%
rename from Indexer/Indexer/Preprocessor.cs
rename to Indexer/Models/Enums/IdxLanguage.cs
index 196b376..b91ee3a 100644
--- a/Indexer/Indexer/Preprocessor.cs
+++ b/Indexer/Models/Enums/IdxLanguage.cs
@@ -4,10 +4,11 @@ using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
 
-namespace Indexer
+namespace Indexer.Models.Enums
 {
-    public class Preprocessor
+    public enum IdxLanguage
     {
-
+        English,
+        Czech
     }
 }
diff --git a/Indexer/Models/Models.csproj b/Indexer/Models/Indexer.Models.csproj
similarity index 68%
rename from Indexer/Models/Models.csproj
rename to Indexer/Models/Indexer.Models.csproj
index 563e6f9..6e9ceba 100644
--- a/Indexer/Models/Models.csproj
+++ b/Indexer/Models/Indexer.Models.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.NET.Sdk">
+<Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
     <TargetFramework>net5.0</TargetFramework>
diff --git a/Indexer/Models/Models/BlogEntriesRepository.cs b/Indexer/Models/Models/BlogEntriesRepository.cs
index b940fbd..c5eb131 100644
--- a/Indexer/Models/Models/BlogEntriesRepository.cs
+++ b/Indexer/Models/Models/BlogEntriesRepository.cs
@@ -4,7 +4,7 @@ using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
 
-namespace WebCrawler.Models
+namespace Indexer.Models
 {
     public class BlogEntriesRepository : IRepository
     {
diff --git a/Indexer/Models/Models/TextData.cs b/Indexer/Models/Models/TextData.cs
new file mode 100644
index 0000000..06ca52e
--- /dev/null
+++ b/Indexer/Models/Models/TextData.cs
@@ -0,0 +1,13 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer.Models
+{
+    public class TextData
+    {
+        public string Text { get; set; }
+    }
+}
diff --git a/Indexer/Models/Models/TextTokens.cs b/Indexer/Models/Models/TextTokens.cs
new file mode 100644
index 0000000..03886be
--- /dev/null
+++ b/Indexer/Models/Models/TextTokens.cs
@@ -0,0 +1,13 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer.Models.Models
+{
+    public class TextTokens
+    {
+        public string[] Tokens { get; set; }
+    }
+}
diff --git a/Indexer/Models/Models/BlogEntry.cs b/Indexer/Models/Repos/BlogEntry.cs
similarity index 86%
rename from Indexer/Models/Models/BlogEntry.cs
rename to Indexer/Models/Repos/BlogEntry.cs
index e5f0c7c..4326c59 100644
--- a/Indexer/Models/Models/BlogEntry.cs
+++ b/Indexer/Models/Repos/BlogEntry.cs
@@ -4,7 +4,7 @@ using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
 
-namespace WebCrawler.Models
+namespace Indexer.Models
 {
     public class BlogEntry
     {
diff --git a/Indexer/Models/Models/IRepository.cs b/Indexer/Models/Repos/IRepository.cs
similarity index 86%
rename from Indexer/Models/Models/IRepository.cs
rename to Indexer/Models/Repos/IRepository.cs
index 476f4e7..0a7efec 100644
--- a/Indexer/Models/Models/IRepository.cs
+++ b/Indexer/Models/Repos/IRepository.cs
@@ -4,7 +4,7 @@ using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
 
-namespace WebCrawler.Models
+namespace Indexer.Models
 {
     public interface IRepository
     {
diff --git a/Indexer/WebCrawler/GamasutraCrawl.cs b/Indexer/WebCrawler/GamasutraCrawl.cs
index 2b55032..8784b43 100644
--- a/Indexer/WebCrawler/GamasutraCrawl.cs
+++ b/Indexer/WebCrawler/GamasutraCrawl.cs
@@ -1,14 +1,16 @@
-using System;
+using Indexer.Models;
+using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
 using System.Text;
+using System.Text.Encodings.Web;
 using System.Text.Json;
+using System.Text.RegularExpressions;
 using System.Threading;
 using System.Threading.Tasks;
-using WebCrawler.Models;
 
-namespace WebCrawler
+namespace Indexer.WebCrawler
 {
     class GamasutraCrawl
     {
@@ -22,12 +24,12 @@ namespace WebCrawler
 
         private readonly GamasutraParser parser = new GamasutraParser();
 
-        public void Start()
+        public void Start(int maxPages)
         {
             
             Console.WriteLine("Downloading HTML page: " + url + urlSection);
-            GetLinks("/member/page=");
-            GetLinks("/expert/page=");
+            GetLinks("/member/page=", maxPages);
+            GetLinks("/expert/page=", maxPages);
 
             blogEntries.PrintRepo();
 
@@ -35,7 +37,7 @@ namespace WebCrawler
         }
 
 
-        private void GetLinks(string typeSection)
+        private void GetLinks(string typeSection, int maxPages)
         {
             int pageNum = 1;
             string html = null;
@@ -55,29 +57,50 @@ namespace WebCrawler
                     Console.WriteLine("No more pages.");
                     html = null;
                 }
-            } while (html != null && pageNum < 10);
+            } while (html != null && pageNum < maxPages);
         }
 
 
         private void ProcessEntries()
         {
-            foreach(var item in blogEntries.BlogEntryLinks)
+            foreach (var item in blogEntries.BlogEntryLinks)
             {
-                BlogEntry entry = new BlogEntry { url = item.Remove(0, 7)};
+                BlogEntry entry = new BlogEntry { url = item.Remove(0, 7).Replace("\t","") };
 
                 StringBuilder fileName = new StringBuilder();
+
                 fileName.Append(entry.url.Replace("/", "-"));
-                fileName.Append(".json");
 
-                if (!File.Exists("downloaded/" + fileName))
+                string html;
+                if (!File.Exists("downloaded_html/" + fileName))
                 {
                     Console.WriteLine("Downloading entry HTML page: " + url + item);
+                    html = download.GetHtml(url + item);
+                    File.WriteAllText("downloaded_html/" + fileName, html);
+                }
+                else
+                {
+                    html = File.ReadAllText("downloaded_html/" + fileName);
+                }
+
+                parser.GetEntry(html, entry);
+
+                fileName.Clear();
+                fileName.Append(entry.url.Replace("/", "-"));
+                fileName.Append(".json");
+
+                if (!File.Exists("downloaded_json/" + fileName))
+                {
+                    
+                    JsonSerializerOptions options = new JsonSerializerOptions
+                    {
+                        Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
+                        WriteIndented = true
+                    };
 
-                    string html = download.GetHtml(url + item);
-                    parser.GetEntry(html, entry);
-                    string json = JsonSerializer.Serialize(entry);
+                    string json = JsonSerializer.Serialize(entry, options);
 
-                    File.WriteAllText("downloaded/" + fileName, json);
+                    File.WriteAllText("downloaded_json/" + fileName, json);
                     Console.WriteLine("File " + fileName + " saved");
 
                     Thread.Sleep(500);
diff --git a/Indexer/WebCrawler/GamasutraParser.cs b/Indexer/WebCrawler/GamasutraParser.cs
index 95d8302..8e26700 100644
--- a/Indexer/WebCrawler/GamasutraParser.cs
+++ b/Indexer/WebCrawler/GamasutraParser.cs
@@ -1,12 +1,12 @@
 using HtmlAgilityPack;
+using Indexer.Models;
 using System;
 using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using System.Threading.Tasks;
-using WebCrawler.Models;
 
-namespace WebCrawler
+namespace Indexer.WebCrawler
 {
     public class GamasutraParser
     {
diff --git a/Indexer/WebCrawler/WebCrawler.csproj b/Indexer/WebCrawler/Indexer.WebCrawler.csproj
similarity index 53%
rename from Indexer/WebCrawler/WebCrawler.csproj
rename to Indexer/WebCrawler/Indexer.WebCrawler.csproj
index 324450c..fca20f7 100644
--- a/Indexer/WebCrawler/WebCrawler.csproj
+++ b/Indexer/WebCrawler/Indexer.WebCrawler.csproj
@@ -6,12 +6,11 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="HtmlAgilityPack" Version="1.11.31" />
-    <PackageReference Include="System.Net.Http" Version="4.3.4" />
+    <PackageReference Include="HtmlAgilityPack" Version="1.11.33" />
   </ItemGroup>
 
   <ItemGroup>
-    <ProjectReference Include="..\Models\Models.csproj" />
+    <ProjectReference Include="..\Models\Indexer.Models.csproj" />
   </ItemGroup>
 
 </Project>
diff --git a/Indexer/WebCrawler/Program.cs b/Indexer/WebCrawler/Program.cs
index ac915eb..8763a82 100644
--- a/Indexer/WebCrawler/Program.cs
+++ b/Indexer/WebCrawler/Program.cs
@@ -3,20 +3,36 @@ using System;
 using System.Net.Http;
 using System.Threading.Tasks;
 
-namespace WebCrawler
+namespace Indexer.WebCrawler
 {
     class Program
-    {
-        
-        
+    {      
         static void Main(string[] args)
         {
             Console.WriteLine("KIV/IR WebCrawler starting");
 
-            System.IO.Directory.CreateDirectory("downloaded");
+            System.IO.Directory.CreateDirectory("downloaded_json");
+            System.IO.Directory.CreateDirectory("downloaded_html");
 
             GamasutraCrawl crawler = new GamasutraCrawl();
-            crawler.Start();
+
+            int maxPages = 10;
+
+            if (args.Length > 0) {
+                // TryParse rejects malformed AND out-of-range input without throwing; keep the default then.
+                if (Int32.TryParse(args[0], out int parsedPages))
+                {
+                    maxPages = parsedPages;
+                }
+                else
+                {
+                    Console.WriteLine($"Unable to parse '{args[0]}'.");
+                }
+            }
+
+            Console.WriteLine($"Going to process {maxPages} pages.");
+            
+            crawler.Start(maxPages);
         }
     }
 }
diff --git a/Indexer/WebCrawler/Utils/HtmlDownloader.cs b/Indexer/WebCrawler/Utils/HtmlDownloader.cs
index 44594d3..04442ae 100644
--- a/Indexer/WebCrawler/Utils/HtmlDownloader.cs
+++ b/Indexer/WebCrawler/Utils/HtmlDownloader.cs
@@ -6,7 +6,7 @@ using System.Net.Http;
 using System.Text;
 using System.Threading.Tasks;
 
-namespace WebCrawler
+namespace Indexer.WebCrawler
 {
     public class HtmlDownloader
     {
-- 
GitLab


From 131543086301299946c11cd56b3b4255607249bc Mon Sep 17 00:00:00 2001
From: Radek Mrvec <radek.mrvec@sumanet.cz>
Date: Sun, 25 Apr 2021 21:02:14 +0200
Subject: [PATCH 4/6] refs #8332 Added Dependency injection and main Indexer
 classes and interfaces

---
 .../.vs/Indexer/DesignTimeBuild/.dtbcache.v2  | Bin 78512 -> 83069 bytes
 Indexer/Indexer/Application.cs                |  70 ++++++++++++++++++
 Indexer/Indexer/Indexer.Indexer.csproj        |   1 +
 Indexer/Indexer/Program.cs                    |  57 +++-----------
 Indexer/Indexer/Services/DictionaryService.cs |  12 +++
 .../Indexer/Services/IDictionaryService.cs    |  13 ++++
 Indexer/Indexer/Services/IIndexerService.cs   |  13 ++++
 Indexer/Indexer/Services/IndexerService.cs    |  14 ++++
 Indexer/Models/Indexer.Models.csproj          |   4 +
 Indexer/Models/Models/Dictionary.cs           |  18 +++++
 Indexer/Models/Models/Index.cs                |  13 ++++
 11 files changed, 169 insertions(+), 46 deletions(-)
 create mode 100644 Indexer/Indexer/Application.cs
 create mode 100644 Indexer/Indexer/Services/DictionaryService.cs
 create mode 100644 Indexer/Indexer/Services/IDictionaryService.cs
 create mode 100644 Indexer/Indexer/Services/IIndexerService.cs
 create mode 100644 Indexer/Indexer/Services/IndexerService.cs
 create mode 100644 Indexer/Models/Models/Dictionary.cs
 create mode 100644 Indexer/Models/Models/Index.cs

diff --git a/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 b/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2
index c912c6df6fdd87ce37466d9e2240b2a99f4a3a71..fc9a77633dc400b98032d23164eb9536cd66fe9a 100644
GIT binary patch
delta 6932
zcmeHJdvH|M9o};>0s@Kw45k8+%5*4n+4sYi67n!4B!rkicr1atckhM``w06WK#=7r
zD2hogvWRaj)<;{FwS`t&YPI$yqJL{UZD;(u_sq_8=+xSN=iI%?-Am}`GE>{Bcjr6L
z@BV(jbI$L6dtUkYx_=z4ARi2q_S$e*bo+gFyT>N_Lq3-;>~r{iqT3tvNj|U7>vnkj
zL6_vV`5i8|+v}1<pXl_-L65`f$D6YB>QLr-vQF8%XU_UqL`o)7iE!E)iw5jgx78hp
zMuLI3oKB_1ct}i!99CPPZIsp4YOPNsWosxJwRBxEytrnqoJ>U$@uha<{V9GDWIn=1
zA6BV96BE;0OMOW&Dfag}t#*^E{!AOl+*^04H*Lo3u<S)Oh-q=0MX_PC51B2huVhgn
z&~`-Rfznl7bqAT%wV-o{>q^$8Pnn%|GN<iEr=QUmJE(S}3$x`Avz#r%^2#<@jHl|O
znN(U%M&iAEHCv3XtY(VAx7Egei)-pL(R3!M{9(!>(#<02E+dlF^~<^Q&JyPsHTdKC
zSoT^wiyuQ*QjElr;^VB<OfmQ#XRSSKt#TP_MFjCwIFXELF>EMV^4O%oJ6pLjT+@<o
zpV&2lY+#O8#pQu?BF?}2zEmO}u(Q)}nm>IzhE|O+Tfo%z*)*uEnB=N-+Qg76EPH|u
z(Id)o$ih9fAUR?&>=+cqs(%njUHr!L)>&k)0-I%Sonb&ZGw4&u_73H?BRA%SDHDm7
zFwLo5%!gG<gyc{n7I0ZzP^_dJHj7xN#;XUfAra%|J4(isEel<W>bW^TdFqS(U#~Ro
z>=G`GmI|$!YMinxVLRG(<xyIy4Z&OW^nfM&m?c)t9FJo0GK<x0Rpakm{_LKQClX0{
z^Wbdd*q-T2uk>x++pCLsOt;cxv{JrIUsF^}{rJK3>ru6oM0G7X+oPNG8BKl;6;0>{
zn#(X?RzkkYpl<P7WmwcjZ#1m>GK}b1sq32Z`oXy@rOL52CO&*|Y2{BZtWnx0&dpCb
z{2C#$zSd>erafbJuKs<Pv!Cgh@k3+EvJK${ASJtTNhy9ucb2at%Kqa$${(jq&HwYr
zyoPx<tWG9+lOi-JWtp`m9f_s_tCMnnG9k&SR3fQ(-#uO@`hy<3<aIdwQqbXZ%3h~k
z4m!Mkd&ut!`h22fbBEos7z#RNk0gs>(dCiClFjKAoubWe3qxMT4{XY(7iZ_&FMdj9
z&1{TkVyQrLXlJV!@6Cw4GQ2CTm%h574T6UVy5yZ{{(T{+nTD~h#5Ose%J*H`J~?|r
zuvi`jP69dL6z~Xe8h8|V40s%P0(cU53V0frZn0RNLCxNO7AVKYtrknx@*TW<26ztm
zF7Q0S488!o2)qP*5BNUt1K?%g72s9iERY9Y16Yth1bzgZ1I`0K27Us}L9oov>j3lh
z2B7)NY7T#j0p?V9_E{bM4DVb3eh$0|m>u6#bo>j<G5c=;%m)jezneuvf7uxE+Zg{P
zKwyA5VD&qwC!l68v(+*_>;J0Q|269C(f<u<=Ipls<Ngj{+{u_@ff$3Z8dsnP0FLEd
z6c+*J;5{~gr?N(=_^^kZ$vr=dbWFOvz#etM$y-RZ-uYVYc0+HWV*A_Z)FP#0tn&?}
z_M_36C5re_b!*X_og3vPj63TK>O9mg)mc!B3@2ys5;N{1!=t)K8SdPqDsn-OX?gCw
zs<8;`bAPEK0e!AHU;Fp-H)k~)w2RPAQc&+;4z!2$`9s%`B55r_yOe~!M(Wl{>j>>O
zk<iyj>vhrwLf3OrR+~Y|1~k@2ofRN-qsbKd8flYG>LGNKiG;pJ+N_hd5W3k!LLU$O
zQ&DFH3EHLWLtir`>7)>$l8J=AMv`?>m{8e7LSG~G>ZGlN_GV42UY!-uS$%{?OeFL*
zQ`^W08KzOvQ0&KuT1n%@S&U&yn_(qQfIUR}Q7UvhY2;(sq{dK%0!@(uO%s|aPVE?*
z8Nkd=LU$Esc8|>rVrCDa_wYdWlDlY@(0%BfqWeh`JwWI|REG#XjOt!OkDxkA=zXZ}
zC-fL;x|=>g=!57UC-fmyLxc|F^$|KkRuoWRJql|ntaSo2L-b)@o<#Y{bL!mYf;zWA
zbEKJ`BB-$9bJ=NR@Vaa#X)drFM#`drJVjcffm}qUp@DQmcA<fkLI$CMv_Y1jfz&`E
zpn=oJVWWW)#(|=NQ>4ADfrG-ipn=1nk06*~x|g&RP|(vDc{I!devF6uI7$V(yAo$m
z-0lJ$;N9XL7w91G7B{v)pTH|8)ETXK<4MBGr+E1^FQ4I7pGEn}JLfJ`l7)gY|MIdd
z{kBj`mjSiFa-a^V2O5Az;4YvESOGKxEx<~k6=(xi0qwwQpaZxY=mffeHNaY+8(0Ud
z2Q~m3fdH^c$kx&xG&TcU01*fR5)cApAPn>ZTY-pByKmnmU0v}nrCJWCL>1Htv`@HP
zsaCKZ-6qsw7o!9X8Wm8glW0s(Z=rF4CXlfhxK*uIzoJf8Z&O~ryeK!Tikwl-|Gw_r
z=Go+>FU1Xs^S>=_Lk7_r`rInhHYzjMq%x<}`B0hpxy6gff~k#YHWt(-R84s+RVRGW
zLUU)S&<ttXfscmCM!&$y?YvC#(jXph<RBh)ayB<-2C1rei`3G$^Vz@tnf&LKBn3f8
z0~z27CE0;Nqmm4uzZ2L6>;?vbJpfaddjPI1?-!NjLW#2MEh@_h%?ir_x=%m}QQ0r3
zH_!tDJqS_BE*8{j>W%76>U4UD3(#Q!4SFvt!9BtR=%_&N1NV@ibbQ$TrR-?84ED!~
z=l}CgHT>`HRAt|f$`P(R*e!|Qk4}$wKl-m$v*uDY8#Z5?_{E4G=2CV)FOTu^fpNuc
sX|84lnfZlE+k;S~<A8Qa`ex}R=^;!QrELiPVSrthMgS&h3cw}pUlZ|h#{d8T

delta 3249
zcmeH|-%k`*6vyX`3SvYoZJJ;zHf=2?Z7sVqv#@Pi7X=ly6<S0^7Tukl#UCKJfC!?y
zBKTX}-9Z)+@kif$YP%1ljcNQBY|@uDzS)N&P43OSG>uLAy~6+t=>z@s#!Wuw+<VWt
z_nga~!~Ah+*_S^rr3VSx?JEg~H8t!Cg-Sg^uS@j?G^JQ6Ho_rIRdn6!R*ZnVR1d0d
zHDrXndeG23Ud62&n&CBcP4j*g@5rJ?v+VUslO+n%&vQLX%$}}l^Vh*u=_mQwl<s@w
zVQc#L{%6_dv-`DamiV5`_9uSWyUp=Ye{y(bS{+(fUXWK6F<K)ny3y9w64|J?Nh_)%
zEsc>tvl;nuz!y|Xk(awz)!c?B;8p^fj-*^#$Z&-{O4#Mmy&)wOR0DWT5Tn$kc}fCm
zKsVf~swm#H@2BmuIrj98^u*JqoTwp7QW!LX!{7)w3XXv$&<u`)7SIY#fCyMENm3iM
zINuKb$sGlf6qPz~_$25Ar$84F7j}cwpa+}*XF(Liz&X$h&VxSC4@8m|z(sHgTn1Ob
zRj?Mxia6JRh;<!I#fwe_9KePMIvs4Gfo|YT93(&z%nzLB1U6w4_YVRQLnLpX6@jJ?
zXJ&j8`$K?mfe0XU3VI2&I4oW({Kb0MS>J-r!g>T+1iKA{?<f$yIk1UDGLzVt`GV;L
z;3M6E7y}}}U9o{tzt)WX(vW($h$^)8TP(KDve?^fy~W<LW(Jn_U`39@239oVVAf5n
z$#$I0o{afe#{BWbnrP;Q?>iT|oD1E|HFKdbVQCMhI1VaP<~k^jgXTDRm^Rlza~w(>
z2h;!j;Y^-+le0OyChK!EbE_P$-auc!6irfs*HHP?HGB`Td|pdaD@d4T<`&;eNs9A*
zkO{t@DujjCO*t^x;s?m$^~C+Qy@9s!gTxPEG0X$BjR%SAP$A+5RG4@p)L}pIBhW{Q
zAA@Ql-VAk|cnef3@e??n<Pq9#K_Im@F>4dEc39%P!<Hu@UyQKAN{ba*ypt;VDT3lP
z;{`-LOQT(|CwK$xupoFh(mXA$=&=u<v5g04=j>xFeioa9?B!k9i;|ErTb{FJuYK-3
z<cn{4ANhDcxBxDKOW-oN0<MB<<g4e`F&O|iKpZ4M5}05R+yp})1%|;bFamCaQE&&0
zfxF-yxDOtHFUXgA{3f|lkIHD}_;?n*BX5bt?m0#O-D{G>q+Ui$W=r_T@^ATUnfOC&
zk~ib^HG~QG$&hS0FO%7;e6!5A;CI}V`XPrri+v*Mo6Yt~Jfr^Of3M2_eUUGZbGyoi
z$JY>@UO-95r)n5KvnbzkS;z`oZnb4b5r4CgBA!#j$*m~9|Gao7DY{Me@$H}zxPTid
zKm|L12b6$PuoLV8pMVd+r{E*m7mM9@>c0~o&)NFl?bLsgSIK37SIdwn{m*2!f`2aa
z8rgbJ?BudlERU_`d+au;l`-Lag@o@u(I)$4UWYc>lPV~peT!`}uT2&|C-@ipSDzDJ
zYd78jTh`msKfe_pEua<szB#@4zkW|9sj>lWbr2MT`7L<}dzmd6z&Z#%06GwD83J}&
F{t3-O-8ujO

diff --git a/Indexer/Indexer/Application.cs b/Indexer/Indexer/Application.cs
new file mode 100644
index 0000000..c2db838
--- /dev/null
+++ b/Indexer/Indexer/Application.cs
@@ -0,0 +1,70 @@
+using Indexer.Indexer.Services;
+using Indexer.Indexer.Utils;
+using Indexer.Models;
+using Microsoft.Extensions.DependencyInjection;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Threading.Tasks;
+
+namespace Indexer.Indexer
+{
+    public class Application
+    {
+
+        private readonly IIndexerService _indexer;
+        private readonly IDictionaryService _dictionary;
+
+        public Application(IIndexerService indexer, IDictionaryService dictionary) => (_indexer, _dictionary) = (indexer, dictionary);
+
+        public void Run(string[] args)
+        {
+            foreach (string path in args)
+            {
+                if (File.Exists(path))
+                {
+                    // This path is a file
+                    ProcessFile(path);
+                }
+                else if (Directory.Exists(path))
+                {
+                    // This path is a directory
+                    ProcessDirectory(path);
+                }
+                else
+                {
+                    Console.WriteLine("{0} is not a valid file or directory.", path);
+                }
+            }
+        }
+
+        public static void ProcessDirectory(string targetDirectory)
+        {
+            // Process the list of files found in the directory.
+            string[] fileEntries = Directory.GetFiles(targetDirectory);
+            foreach (string fileName in fileEntries)
+                ProcessFile(fileName);
+
+            // Recurse into subdirectories of this directory.
+            string[] subdirectoryEntries = Directory.GetDirectories(targetDirectory);
+            foreach (string subdirectory in subdirectoryEntries)
+                ProcessDirectory(subdirectory);
+        }
+
+        // Insert logic for processing found files here.
+        public static void ProcessFile(string path)
+        {
+            string contents = File.ReadAllText(path);
+            BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents);
+
+            Preprocessor processor = new();
+            HashSet<string> tokens = processor.Process(entry);
+
+            Console.WriteLine("Processed file '{0}'. Number of tokens {1}", path, tokens.Count);
+        }
+    }
+}
+
diff --git a/Indexer/Indexer/Indexer.Indexer.csproj b/Indexer/Indexer/Indexer.Indexer.csproj
index 4b89db9..5d248e9 100644
--- a/Indexer/Indexer/Indexer.Indexer.csproj
+++ b/Indexer/Indexer/Indexer.Indexer.csproj
@@ -11,6 +11,7 @@
 
   <ItemGroup>
     <PackageReference Include="HtmlAgilityPack" Version="1.11.33" />
+    <PackageReference Include="Microsoft.Extensions.DependencyInjection" Version="5.0.1" />
     <PackageReference Include="Microsoft.ML" Version="1.5.5" />
     <PackageReference Include="StemmersNet.Standard" Version="1.1.1" />
   </ItemGroup>
diff --git a/Indexer/Indexer/Program.cs b/Indexer/Indexer/Program.cs
index 4e11e51..f8aa3ce 100644
--- a/Indexer/Indexer/Program.cs
+++ b/Indexer/Indexer/Program.cs
@@ -1,5 +1,8 @@
 
+using Indexer.Indexer.Services;
+using Indexer.Indexer.Utils;
 using Indexer.Models;
+using Microsoft.Extensions.DependencyInjection;
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -10,56 +13,18 @@ namespace Indexer.Indexer
     class Program
     {
 
-
         static void Main(string[] args)
         {
-            new Program().Run(args);
-        }
-
-        public void Run(string[] args)
-        {
-            foreach (string path in args)
-            {
-                if (File.Exists(path))
-                {
-                    // This path is a file
-                    ProcessFile(path);
-                }
-                else if (Directory.Exists(path))
-                {
-                    // This path is a directory
-                    ProcessDirectory(path);
-                }
-                else
-                {
-                    Console.WriteLine("{0} is not a valid file or directory.", path);
-                }
-            }
-        }
+            IServiceCollection serviceCollection = new ServiceCollection();
+            serviceCollection.AddScoped<Application>();
+            serviceCollection.AddSingleton<IIndexerService, IndexerService>();
+            serviceCollection.AddSingleton<IDictionaryService, DictionaryService>();
 
-        public static void ProcessDirectory(string targetDirectory)
-        {
-            // Process the list of files found in the directory.
-            string[] fileEntries = Directory.GetFiles(targetDirectory);
-            foreach (string fileName in fileEntries)
-                ProcessFile(fileName);
+            var serviceProvider = serviceCollection.BuildServiceProvider();
 
-            // Recurse into subdirectories of this directory.
-            string[] subdirectoryEntries = Directory.GetDirectories(targetDirectory);
-            foreach (string subdirectory in subdirectoryEntries)
-                ProcessDirectory(subdirectory);
+            var app = serviceProvider.GetService<Application>();
+            app.Run(args);
         }
 
-        // Insert logic for processing found files here.
-        public static void ProcessFile(string path)
-        {
-            string contents = File.ReadAllText(path);
-            BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents);
-
-            Preprocessor processor = new();
-            HashSet<string> tokens = processor.Process(entry);
-
-            Console.WriteLine("Processed file '{0}'. Number of tokens {1}", path, tokens.Count);
-        }
-    }
+    }  
 }
diff --git a/Indexer/Indexer/Services/DictionaryService.cs b/Indexer/Indexer/Services/DictionaryService.cs
new file mode 100644
index 0000000..f4f3b16
--- /dev/null
+++ b/Indexer/Indexer/Services/DictionaryService.cs
@@ -0,0 +1,12 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer.Indexer.Services
+{
+    public class DictionaryService : IDictionaryService
+    {
+    }
+}
diff --git a/Indexer/Indexer/Services/IDictionaryService.cs b/Indexer/Indexer/Services/IDictionaryService.cs
new file mode 100644
index 0000000..c0123fb
--- /dev/null
+++ b/Indexer/Indexer/Services/IDictionaryService.cs
@@ -0,0 +1,13 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer.Indexer.Services
+{
+    public interface IDictionaryService
+    {
+
+    }
+}
diff --git a/Indexer/Indexer/Services/IIndexerService.cs b/Indexer/Indexer/Services/IIndexerService.cs
new file mode 100644
index 0000000..292a712
--- /dev/null
+++ b/Indexer/Indexer/Services/IIndexerService.cs
@@ -0,0 +1,13 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer.Indexer.Services
+{
+    public interface IIndexerService
+    {
+
+    }
+}
diff --git a/Indexer/Indexer/Services/IndexerService.cs b/Indexer/Indexer/Services/IndexerService.cs
new file mode 100644
index 0000000..1bb88f9
--- /dev/null
+++ b/Indexer/Indexer/Services/IndexerService.cs
@@ -0,0 +1,14 @@
+using Indexer.Indexer.Services;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer.Indexer.Utils
+{
+    public class IndexerService : IIndexerService
+    {
+
+    }
+}
diff --git a/Indexer/Models/Indexer.Models.csproj b/Indexer/Models/Indexer.Models.csproj
index 6e9ceba..67d4c19 100644
--- a/Indexer/Models/Indexer.Models.csproj
+++ b/Indexer/Models/Indexer.Models.csproj
@@ -4,4 +4,8 @@
     <TargetFramework>net5.0</TargetFramework>
   </PropertyGroup>
 
+  <ItemGroup>
+    <PackageReference Include="System.Collections.NonGeneric" Version="4.3.0" />
+  </ItemGroup>
+
 </Project>
diff --git a/Indexer/Models/Models/Dictionary.cs b/Indexer/Models/Models/Dictionary.cs
new file mode 100644
index 0000000..0fe3276
--- /dev/null
+++ b/Indexer/Models/Models/Dictionary.cs
@@ -0,0 +1,18 @@
+using System;
+using System.Collections;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer.Models.Models
+{
+    public class Dictionary
+    {
+        private readonly Hashtable content;
+        /// <summary>Creates the dictionary with an empty backing table.</summary>
+        public Dictionary() { content = new Hashtable(); }
+        /// <summary>Gets the table created by the constructor (a bare auto-property here would stay null).</summary>
+        public Hashtable Content => content;
+
+    }
+}
diff --git a/Indexer/Models/Models/Index.cs b/Indexer/Models/Models/Index.cs
new file mode 100644
index 0000000..f429c9f
--- /dev/null
+++ b/Indexer/Models/Models/Index.cs
@@ -0,0 +1,13 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Indexer.Models.Models
+{
+    public class Index
+    {
+
+    }
+}
-- 
GitLab


From fa4af9abb7c6c93ac2634970fcf4f1c4a3457dc7 Mon Sep 17 00:00:00 2001
From: Radek Mrvec <radek.mrvec@sumanet.cz>
Date: Mon, 26 Apr 2021 22:47:33 +0200
Subject: [PATCH 5/6] refs #8332 Dictionary created during preprocess @1

---
 .../.vs/Indexer/DesignTimeBuild/.dtbcache.v2  | Bin 83069 -> 84517 bytes
 Indexer/Indexer/Application.cs                |  57 +++++++----------
 Indexer/Indexer/Services/DictionaryService.cs |  14 +++++
 .../Indexer/Services/IDictionaryService.cs    |   2 +
 Indexer/Indexer/Utils/FileProcessor.cs        |  58 ++++++++++++++++++
 Indexer/Indexer/Utils/Preprocessor.cs         |  14 ++++-
 6 files changed, 108 insertions(+), 37 deletions(-)
 create mode 100644 Indexer/Indexer/Utils/FileProcessor.cs

diff --git a/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 b/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2
index fc9a77633dc400b98032d23164eb9536cd66fe9a..a73f8f09a5c27f79e97a662a60f47790b4b5a34c 100644
GIT binary patch
delta 5551
zcmeHJdvH|M9o}=3Km@`oP)Ic4ZPE^G?%ns|P!ryfXCy=(8{qEVyBG+&VRxY*=mrFZ
z@GzSb2q5y%Qmr$cvF_N|N;S1FTZ$E01yLm6gF)!@p6)-jwKM(B-MgC&qY@eIU%fNm
z`F-d7zVo=}p5O1>|Ln*)(V9a(iII8bUbiAkK1Fn^R@LE@txlIBxg4tOcl&Ij52b8%
z`z5F9@>&#0wA*}&Q}rpbY;(BW7Kg<eYc3>fX?f>zns+Ime)d60cGi@M9y;RE7`mMF
zrjsunq@R%v;-znO&ZhG&=2Ee9Ku&ffv`(ep?~LVSo0a-{T5)L-?R&X4J~8{0KvvR<
zk(E@7700jVIkM>HjaBq-t^MO?2KOfMZ9|@Ki2rN*@@yKrJdW<UToXUll%GXEI$jgk
zVrL1BoT-KQA=-Ypw&g&-xNZOD3NoJl^jtBYmJYgNr+MdRlUi!N_+8p?#U6j@{1HM=
zTq()zGd~nu8Isq~H?9=X^%pAW$IxE8FucOzblL1apUvy{OJ3FIwfZfhtoXc=;&i)I
zuS4<5Zb@=ECCTm;T|U)nQ5}AV*NN>BeOA%58hazsk=eQQC>f-0KrkzpmXdrrjEtq_
zWCW?qP%NWUIwwF~U?}?$Plh+u-;(ZANdJ*FQ#e6d`jtrPhKL#n*98M%v!XVr0Ywe?
zHr54JslEsr&9XNf3CTJSd+cV>EYdA~i&sqR>NPdpt1{iIBBK}oRHhAc>U^PKIOva<
zd(@S740Y$rir4LQTU{2nqR6U4G(E~!9R2DQ>x+~;^>toPK#gF<o*qxl7rOeE>fy%L
zj2~H}hWN%!(znGd(W)V%2j0A9x;FRzcunsKz3ACN#qoOwSqZJ`TTGkx-9x1z3%urA
zPG`(k#wU?XIZ0F6({83&*s-xa*)cj{`)L|&8cjDf%@<xERE$|}?`&9l^nsHnN<SNW
z^~o0XWRbV?S?N#q%JGSNS7g!G>6^$X$99+M+yBD3TCzkBX&zd;&VMi3zoMa`zRoA>
z2j3Sq6)jLh>+5`K*fX_GU#}e6Xf&DOE>62EztDeYm%kEWwLuMKbS~_2rhKFH$@s`a
z7e?yp!=Cve75N48hl3&BEYilNQS{vl11rj{HmhQD$_~-#mHl?fW%o;p=(PG&(I&ZU
zqUcn;Hm~S*`0a`)JFKcA*=;UWl_i@?bjzY6UjAM_p=?Ng!Q5bAh8j>qb-ps2*=iO|
zIZ0E*z5TkprUgi_!)b$%?VP)Ah8meJ*VlVx-|FeXdPNO+!W+X8b&c5<tglaIj<7ip
z46IC!?9qpeSG^w~V}{HLDk@&CYs>hn4KP~ujiRqy+!-(Ya8eejMjAt)TrChxbJH2E
z`4gp2o+<hCH_;ncEkj=tOs1a!1JT+J`(xl``uD44(UhkEo}Z)j7r-mPN#K{j%fPF^
zYd{C^E8unD*T8Rp-vYk_eh<6>`~i3o_#<!%_!Dp%coX<DP-M*W7C7dLn$Cb^PIwk|
zK51&Qo=)@^^!Y3BHt-H`^TY#;iO<2u$3G8r2+`!~Z|<DA;&&mu01ynoC*bxy*eqE7
z=KO8Bp4Z<u>K9>qq5c6ZpX?IAwVeRh-T@z9$ayyk{-)fSxro#P&}_Pl;tIeg_>ec?
zQHp5ewPkJNOUN|)xG4#3L?WTxt&K})V-x8IRsGRQ3_+<@k`~Y<pBDBu#B_58Ka#;a
zJ}r!94p?RkC>aA<G$}nGccH33n$-~4G;4Q()ezVXfkU%*7uXGf(-6?w>(8gxIp9Xu
znxz|_yv(&#wpW#s7Y9XKNQ_mJsmWp3LZS_3H6&R>30ImY*aM`6G_yr0V{9>*#yxCF
zQlJS5wv;58hp<}Rzl=<0%L!Y7$^j;m8O%$V4@MzOh4B-%66V2L!s=jG5w;qpp0G7A
z0m6bX4TP;l_ZAi+GZQGVv@mC3&LZ$Mvvqp89_8nUwBpKyR-9lzAeC$bfnn9{1%zzQ
zj&6iM#+H%$5-8Y1Smq`^#Adzo!@6@Rnbq|&3HAsYTebYzXp0hFHtOXTz1*tz!_Ncx
z{3P2(%Gq|{G2n4v2e1=p0(JqrNx6sZ0oe;Y0W<?Kpaq~nE3gmP53~UXfP=sx;7Q;x
z@Dy+aI0_sCjss5v&ye!Ajxj`PJ6S>cwYT>oRw3Hh_>pnw-x+B#X{N0j(F84vl?(qI
ztmW|StPm;yHd#Qa<+3S))`v|M*fe}XG~{XhwE@~dHeHyEJ(?kau$jDsv{KN%!|oH<
zEJ)E%pcQI&Y6G>q@F}z+Y{#=&F}-}PBFbh9$@9qO2;d2Alzxgh<!2^oK<1`o^91m;
z43x`~24sFpRwaO^WuRP^G$8k<WYGlzDrp@k*Ch=|bxOBT08h(6xh!cwYErTX1n{&B
zl*^I^WKl}CSOC{$Q7!=QGN>;}>6QxM>ES@RENMVIDOs%mo|b`fS<-+kOUafC;At5s
z9zSo=AGRW;lLhd!4ipb>(I4#_&xzgg;DqEX%$r(_PXJFZ1{4o((I2LyWU2t3mVt6v
z(ja-387WWH%DU%7o_6^s&~BA?nLjntN&!4Q6DXf4X)w-hq-TskXI_8pe~^k>zakYs
zC?u0~U2<!Al3oR@7FfOES<~f-2H*+`tif=t?Q(_S2@78@vw!Wx9{sY+{@fR2cA||&
zwja%NNu{1pkS#!vGLR2IkQ5N`Ac%T|Gzj7vK?{P&MR0;3LJ@Z$2tGvCFfhayVh02P
zgy;ZSA+(JoWNaypCu%sOIDjDhT$W)4IC>I{#7_&F{X%;cAya#B<+S3u>~`g>5_r}4
zQVAB&tH$TS?Wb3bPh#9pNY4Cfis<D!y<D%CTgfzjIqEmm>7LwBUvfdU?Yf&3`W_a_
z*&{#{Xau$ZUvzP8Mf;;b>H^yaep{E=4HDgszK;PSAORM@3fO?h0SDj&T)+<C9$+WX
z1ndHK1ABl;z+T{9p}ev2W8+@@_g$Iy;uFTb7-P-)jaN-r^Du$M1k`cmweTyC3ak}h
zzPat~JBZ`|Kae~-YF$5&&E0+s_ZjJ9e^>fw!`>bEmeR-QTmBdxXfGsWQx8Hi#@h5K
zGgh98vTFUO@1P!8hxGDEBg)c2_G%A;Y#~l0e%VsLf#1sS-wm?(Fi!hZz!Bi)$T^C(
e%*Z*0`f=cC;2GeDzzN_-z_S34p638Pdj1P`x@qSC

delta 4884
zcmeH}dvH|M9mn@~cS8b%Jczc1k(kmT#%kC{_GPuRAwsemCCEGE!S3DN5C};~Kv0k+
zKp;M%n;dWv<)y`H?X*L8+GvZS)!LHg(fWsFtfo_R)bRoLc>n1icIfxqdv}xUbez%7
zR6F&~e17Nr&iS5m@1Aq=%{g^2<DWel^yfZW;)_HAPOryib6Er4kf+QO@z}kdfYTlH
z$R4-H?X<hR!7|xt_1epvPIs9c@B|#5aL{FUcrh|8_r+$@Ms|6xl^q?K#ZGi(v6{EC
z*vPPZin-#(I@28X;_#fQri{`Joo%fqzagB>+D58bu5^a#jFv2>jLgflZ0Xn<HYMAV
z(`B>H(Q7g-rJ<IV)C@wO&ss;^qD;t^At_P*`mQO|#HioTin<FE@l2OFarXRDq6W0p
zuq6j_64y`9B-Z;zAv=EI2dw$cWeG?2PukgrzYZnd-q~oOV(}5I?yc&?Kv#j8y);&r
zs2W>mW*6AXL{azaL}B*w*tP8K^FL&{7n<0o`~N~gtp(H3Hg>vaJ&R@MBz!%s<x<H4
z_V;sFvc}OXSjIbcmOofcwX9*VL|dNbcdkpE89YXX(`q`}nmPiTO65+|l`FP1w{+Ik
zbcEYG+T?I&XB#^?w3yl7wX>^+zf<lFx@@xBZuiPTyT=iBJ8a>g-R-r7yk$X;Cm>s$
z5ob6M3Od3rIUI-t%3N|pwmRGaN5JZ}MohI>X*U~t_f6*epeV8Zy=r0~z29Bo3tR1;
zGP~@JcxBm+DX{NBw;YJrJPz3zj#xbrN6_tdIAy0CaoX`7t1}b~+3e0pnbqa>I!&9k
zExbrqVfPDIJ<VX3ho`VPG@puCA=peuuuS5FT%_qN(|m0z(?=XKTUiI)#4;`QG#@)K
zixru3S-9&?>3V$)qk{$P{5dbHA8cf`L-Seh&@6Ho!s#q~I4{xu!Odp6BXyK+`Zy$+
zmTT$QN5#<@CjH4laZoau4uMBN92^FZf+OHD@HjXMj)CLg1UL!uF!l+QV*Hcf3(Rp%
zYLZ!)<VP5M3OogV44wvJ!e<!&c!6KM`j_BWU;sP^o(I1MbBxiyL0!yn227mg1>|C;
zXTeVyeRB2G)NHK$lY)};`Erf<ert?-#Ta{9sG}3#{3?2W2M8n18s%#!&B#xq6z>sw
zk^kPv|A2A|@^dJ~R}BK8{UZ?Cspu1LPQ4ZN7lD}nCCT)?)K5ol8bTNb;sqn30nf}L
z_Qu5qIu#$trV6(GQcl))Rq-hESns9T$z(qJ*?3aIM#q!yGi5xvf#oZyF$>~9&!tpm
zi?@}IXO4JmLgtL0E~T@{ca_I4=Zz~X;zjvXm+bQ={>Gonk5;J4)vC&u6JJKEQmlQj
z7DTRBR*)fDNqj|`2)Qm=l@zTezA8<GTo<iLifW0k(L_<b2T?TWs<lZ~9r3kkQz6$y
z^+{0!@%l6oa$VGz6s;rPm?lDw2k+;Bq$)_LPhJG%`c!gK6e2FCiID4}a8eW@9!?V>
z*F{Z9(R$)d(KJ<4Qq`PPZ6My9CPJ=HwUPR%kGIe*M!t!ZnY`8LMH{;G9uavP)G^+U
z$oOVjq_ssmbcG_RypvSEh4@yZ>#p&hZRpuf{BEP?p7EX?=-Em9UhPA6(PADYz8lG5
z-bIys5AnSy_7U$!v7h(>6g|Z6LvcUx2dHv2e~@@D(uasYj3P$755xO;KUJv+_&rS2
zn5Z2@PmCYZ;v<NkJ*5=-Ri#knaq{!SgaW@<nTxi$@zG)`9yn&9rIzS2@~h%F<Mg6}
zBa0J?3XUZX9V$3^IB2Ngbm1_ef)j)Tg9=Uw&IBqr2-tU2u-VvKRIs7ENmQ_B*dkQ0
zFZ@w_S0CR+x2g#I2wEPCh;M&f`|6{J3=4f5w!>I?m2cBhW4TqnLraY{R{1duIj9s=
zW5jV1@q`vnYViqe)+Z4^du`yvbyOn7)#+3j<vz*B%fUiW0d4_{z+zAds(>Hd3T^|{
z;C65aSORLmQgA0&29|>rU?o@uR)aO57OVwzpdK`UMybWe*P#*sK_G(=2!jY{0_#Du
z<lDV_H0hrBZ=#s?D5OY=nQxH(F;mG9OSw^6h~;Y`RCtSosATa?lJYIyD)Ba)+KutX
z8^{%3oCo7QwSX25WLfFiZ+s00<$v4PViWu-_Js}R_LRf?M+S4)Fq7vkG%tG*d}WcU
zRHCRHxNITC{8c;4jgy@vrke%+ScFfsOCoO8VuuzdxW$<`+`>k`h*K4PJL47d@mB|G
zrqPG=-?NTRIKURL6^vU)RJV@1&^XaLwjtjR?gsaO9bhLAR&p=UtmIFIm7JYqCA$nO
z>E}_Y9Pr%|B3z_PQl{}e65k6KLBk9sN6A%+^Y}i^ak?c`_<j+gJs=#XN8<NEyH8?s
zFRl8<ag3{l_#cVq|MMo*_jxy|FMTWeGM-f|?nc_J=*Z{XioT{s=AUe2ed$*u?GnVh
zH5a>Iiw|h=!GG&(H^(hibj4TWYP~R{hrr|ukwbU2htV<7)ndr|fVdd-1L0;2Xm0i|
DCb6f>

diff --git a/Indexer/Indexer/Application.cs b/Indexer/Indexer/Application.cs
index c2db838..1e80d09 100644
--- a/Indexer/Indexer/Application.cs
+++ b/Indexer/Indexer/Application.cs
@@ -18,52 +18,39 @@ namespace Indexer.Indexer
         private readonly IIndexerService _indexer;
         private readonly IDictionaryService _dictionary;
 
+        private List<string> files = new();
+
         public Application(IIndexerService indexer, IDictionaryService dictionary) => (_indexer, _dictionary) = (indexer, dictionary);
 
         public void Run(string[] args)
         {
-            foreach (string path in args)
-            {
-                if (File.Exists(path))
-                {
-                    // This path is a file
-                    ProcessFile(path);
-                }
-                else if (Directory.Exists(path))
-                {
-                    // This path is a directory
-                    ProcessDirectory(path);
-                }
-                else
-                {
-                    Console.WriteLine("{0} is not a valid file or directory.", path);
-                }
-            }
-        }
+            FileProcessor fileProcessor = new();
+            List<string> files = fileProcessor.GetFiles(args);
 
-        public static void ProcessDirectory(string targetDirectory)
-        {
-            // Process the list of files found in the directory.
-            string[] fileEntries = Directory.GetFiles(targetDirectory);
-            foreach (string fileName in fileEntries)
-                ProcessFile(fileName);
+            Console.WriteLine($"Going to preprocess {files.Count} files.");
 
-            // Recurse into subdirectories of this directory.
-            string[] subdirectoryEntries = Directory.GetDirectories(targetDirectory);
-            foreach (string subdirectory in subdirectoryEntries)
-                ProcessDirectory(subdirectory);
+            PreprocessFiles(files);
         }
 
-        // Insert logic for processing found files here.
-        public static void ProcessFile(string path)
+        private void PreprocessFiles(List<string> files)
         {
-            string contents = File.ReadAllText(path);
-            BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents);
+            int tokensCount = 0;
+            foreach (var item in files)
+            {
+                string content = File.ReadAllText(item);
 
-            Preprocessor processor = new();
-            HashSet<string> tokens = processor.Process(entry);
+                HashSet<string> tokens = GetTokens(content);
+                tokensCount += tokens.Count;
+            }
+            Console.WriteLine($"All files has been preprocessed and {tokensCount} tokens created.");
+            Console.WriteLine($"Dictionary with {_dictionary.GetLastId()} words created.");
+        }
 
-            Console.WriteLine("Processed file '{0}'. Number of tokens {1}", path, tokens.Count);
+        private HashSet<string> GetTokens(string content)
+        {
+            BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(content);
+            Preprocessor processor = new(_dictionary);
+            return processor.Process(entry);
         }
     }
 }
diff --git a/Indexer/Indexer/Services/DictionaryService.cs b/Indexer/Indexer/Services/DictionaryService.cs
index f4f3b16..17ff9cc 100644
--- a/Indexer/Indexer/Services/DictionaryService.cs
+++ b/Indexer/Indexer/Services/DictionaryService.cs
@@ -8,5 +8,19 @@ namespace Indexer.Indexer.Services
 {
     public class DictionaryService : IDictionaryService
     {
+        private Dictionary<string, int> idxDictionary = new();
+
+        public void AddToDictionary(string key, int value)
+        {
+            if (!idxDictionary.ContainsKey(key))
+            {
+                idxDictionary.Add(key, value);
+            }
+        }
+
+        public int GetLastId()
+        {
+            return idxDictionary.Count;
+        }
     }
 }
diff --git a/Indexer/Indexer/Services/IDictionaryService.cs b/Indexer/Indexer/Services/IDictionaryService.cs
index c0123fb..ddd4d0c 100644
--- a/Indexer/Indexer/Services/IDictionaryService.cs
+++ b/Indexer/Indexer/Services/IDictionaryService.cs
@@ -8,6 +8,8 @@ namespace Indexer.Indexer.Services
 {
     public interface IDictionaryService
     {
+        public void AddToDictionary(string key, int value);
 
+        public int GetLastId();
     }
 }
diff --git a/Indexer/Indexer/Utils/FileProcessor.cs b/Indexer/Indexer/Utils/FileProcessor.cs
new file mode 100644
index 0000000..50d6994
--- /dev/null
+++ b/Indexer/Indexer/Utils/FileProcessor.cs
@@ -0,0 +1,58 @@
+using Indexer.Models;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Threading.Tasks;
+
+namespace Indexer.Indexer.Utils
+{
+    /// <summary>
+    /// Expands file and directory paths into a flat list of file paths.
+    /// </summary>
+    public class FileProcessor
+    {
+        /// <summary>
+        /// Collects each path in <paramref name="args"/> that is an existing file, plus all files
+        /// under each path that is an existing directory (subdirectories included). Any other
+        /// path is reported on the console and skipped.
+        /// </summary>
+        /// <param name="args">File or directory paths to expand.</param>
+        /// <returns>A list created for this call only; earlier calls do not leak into it.</returns>
+        public List<string> GetFiles(string[] args)
+        {
+            // Per-call accumulator: an instance-level list would keep growing across calls.
+            List<string> files = new();
+            foreach (string path in args)
+            {
+                if (File.Exists(path))
+                {
+                    files.Add(path);
+                }
+                else if (Directory.Exists(path))
+                {
+                    ProcessDirectory(path, files);
+                }
+                else
+                {
+                    Console.WriteLine("{0} is not a valid file or directory.", path);
+                }
+            }
+
+            return files;
+        }
+
+        /// <summary>Appends the directory's own files, then those of each subdirectory, recursively.</summary>
+        private static void ProcessDirectory(string targetDirectory, List<string> files)
+        {
+            files.AddRange(Directory.GetFiles(targetDirectory));
+
+            foreach (string subdirectory in Directory.GetDirectories(targetDirectory))
+            {
+                ProcessDirectory(subdirectory, files);
+            }
+        }
+    }
+}
diff --git a/Indexer/Indexer/Utils/Preprocessor.cs b/Indexer/Indexer/Utils/Preprocessor.cs
index 3ccff14..7ea737c 100644
--- a/Indexer/Indexer/Utils/Preprocessor.cs
+++ b/Indexer/Indexer/Utils/Preprocessor.cs
@@ -1,4 +1,5 @@
-using Indexer.Models;
+using Indexer.Indexer.Services;
+using Indexer.Models;
 using Indexer.Models.Enums;
 using Indexer.Models.Models;
 using Iveonik.Stemmers;
@@ -16,6 +17,10 @@ namespace Indexer.Indexer
     public class Preprocessor
     {
 
+        private readonly IDictionaryService _dictionary;
+
+        public Preprocessor(IDictionaryService dictionary) => _dictionary = dictionary;
+
         public HashSet<string> Process(BlogEntry entry)
         {
             string content = RemoveTags(entry.Content);
@@ -80,9 +85,14 @@ namespace Indexer.Indexer
         private HashSet<string> StemmTokens(IStemmer stemmer, params string[] words)
         {
             HashSet<string> stems = new HashSet<string>();
+            // Every trimmed stem goes into the result set and is offered to the dictionary.
             foreach (string word in words)
             {
-                stems.Add(stemmer.Stem(word));
+                string stem = stemmer.Stem(word).Trim();
+                stems.Add(stem);
+
+                // Proposed id: the dictionary's current last id plus one.
+                _dictionary.AddToDictionary(stem, _dictionary.GetLastId() + 1);
             }
             return stems;
         }
-- 
GitLab


From 9a127a72235d8e25ad54a6c04defbbcfb4d9076f Mon Sep 17 00:00:00 2001
From: Radek Mrvec <radek.mrvec@sumanet.cz>
Date: Sun, 2 May 2021 19:16:01 +0200
Subject: [PATCH 6/6] refs #8332 Indexing engine

---
 Indexer/Indexer/Application.cs                | 18 ++++++++---
 Indexer/Indexer/Services/DictionaryService.cs |  5 ++++
 .../Indexer/Services/IDictionaryService.cs    |  2 ++
 Indexer/Indexer/Services/IIndexerService.cs   |  2 ++
 Indexer/Indexer/Services/IndexerService.cs    | 30 +++++++++++++++++++
 5 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/Indexer/Indexer/Application.cs b/Indexer/Indexer/Application.cs
index 1e80d09..fa29a9d 100644
--- a/Indexer/Indexer/Application.cs
+++ b/Indexer/Indexer/Application.cs
@@ -27,25 +27,35 @@ namespace Indexer.Indexer
             FileProcessor fileProcessor = new();
             List<string> files = fileProcessor.GetFiles(args);
 
-            Console.WriteLine($"Going to preprocess {files.Count} files.");
+            Console.WriteLine($"Going to process {files.Count} files.");
 
-            PreprocessFiles(files);
+            ProcessFiles(files);
         }
 
-        private void PreprocessFiles(List<string> files)
+        private void ProcessFiles(List<string> files)
         {
-            int tokensCount = 0;
+            int tokensCount = 0, fileId = 0; // fileId: position of the current file in the list
+            Console.WriteLine($"Starting processing and indexing files.");
             foreach (var item in files)
             {
                 string content = File.ReadAllText(item);
 
                 HashSet<string> tokens = GetTokens(content);
+                IndexFile(tokens, fileId++); // running counter; IndexOf would rescan the list each time
                 tokensCount += tokens.Count;
             }
             Console.WriteLine($"All files has been preprocessed and {tokensCount} tokens created.");
             Console.WriteLine($"Dictionary with {_dictionary.GetLastId()} words created.");
         }
 
+        /// <summary>Adds <paramref name="file"/> to the index under the dictionary id of each token.</summary>
+        private void IndexFile(HashSet<string> tokens, int file)
+        {
+            // The word id comes from the dictionary service; the indexer only ever sees ids.
+            foreach (string token in tokens)
+                _indexer.AddToIndex(_dictionary.GetWordId(token), file);
+        }
+
         private HashSet<string> GetTokens(string content)
         {
             BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(content);
diff --git a/Indexer/Indexer/Services/DictionaryService.cs b/Indexer/Indexer/Services/DictionaryService.cs
index 17ff9cc..43f392e 100644
--- a/Indexer/Indexer/Services/DictionaryService.cs
+++ b/Indexer/Indexer/Services/DictionaryService.cs
@@ -18,6 +18,11 @@ namespace Indexer.Indexer.Services
             }
         }
 
+        /// <summary>Looks up the id stored for <paramref name="word"/>.</summary>
+        /// <returns>The stored id, or 0 when the word is not in the dictionary.</returns>
+        public int GetWordId(string word) =>
+            idxDictionary.TryGetValue(word, out int id) ? id : 0;
+
         public int GetLastId()
         {
             return idxDictionary.Count;
diff --git a/Indexer/Indexer/Services/IDictionaryService.cs b/Indexer/Indexer/Services/IDictionaryService.cs
index ddd4d0c..8d38c79 100644
--- a/Indexer/Indexer/Services/IDictionaryService.cs
+++ b/Indexer/Indexer/Services/IDictionaryService.cs
@@ -10,6 +10,8 @@ namespace Indexer.Indexer.Services
     {
         public void AddToDictionary(string key, int value);
 
+        public int GetWordId(string word);
+
         public int GetLastId();
     }
 }
diff --git a/Indexer/Indexer/Services/IIndexerService.cs b/Indexer/Indexer/Services/IIndexerService.cs
index 292a712..d80329e 100644
--- a/Indexer/Indexer/Services/IIndexerService.cs
+++ b/Indexer/Indexer/Services/IIndexerService.cs
@@ -8,6 +8,8 @@ namespace Indexer.Indexer.Services
 {
     public interface IIndexerService
     {
+        public void AddToIndex(int word, int file);
 
+        public void ExportIndex();
     }
 }
diff --git a/Indexer/Indexer/Services/IndexerService.cs b/Indexer/Indexer/Services/IndexerService.cs
index 1bb88f9..e29e706 100644
--- a/Indexer/Indexer/Services/IndexerService.cs
+++ b/Indexer/Indexer/Services/IndexerService.cs
@@ -9,6 +9,36 @@ namespace Indexer.Indexer.Utils
 {
     public class IndexerService : IIndexerService
     {
+        private Dictionary<int, LinkedList<int>> idx = new();
+
+        /// <summary>Appends <paramref name="file"/> to the posting list of <paramref name="word"/>.</summary>
+        /// <param name="word">Key whose posting list is extended; the list is created on first use.</param>
+        /// <param name="file">Value appended to the end of that posting list.</param>
+        public void AddToIndex(int word, int file)
+        {
+            // One lookup covers both cases; a missing word gets a fresh list that is
+            // stored directly, so the list appended to is always the one in the index.
+            if (!idx.TryGetValue(word, out LinkedList<int> postList))
+            {
+                postList = new LinkedList<int>();
+                idx.Add(word, postList);
+            }
+            postList.AddLast(file);
+        }
+
+
+        /// <summary>Writes one console line per index entry: the key, " --> ", then each file followed by a space.</summary>
+        public void ExportIndex()
+        {
+            foreach (var (wordId, postings) in idx)
+            {
+                Console.Write($"{wordId} --> ");
+                foreach (int fileId in postings)
+                    Console.Write($"{fileId} ");
+
+                Console.Write("\n");
+            }
+        }
 
     }
 }
-- 
GitLab