From 552c47d28e1b53ba92b16407f4b888593ad2d233 Mon Sep 17 00:00:00 2001 From: Radek Mrvec <radek.mrvec@sumanet.cz> Date: Tue, 20 Apr 2021 23:17:36 +0200 Subject: [PATCH 1/6] Natazeni souboru dat z json souboru refs #8331 @2h --- .gitignore | 5 ++ Indexer/Indexer.sln | 14 ++++- Indexer/Indexer/Indexer.csproj | 16 +++++ Indexer/Indexer/Preprocessor.cs | 13 ++++ Indexer/Indexer/Program.cs | 60 +++++++++++++++++++ Indexer/Models/Models.csproj | 7 +++ .../Models/BlogEntriesRepository.cs | 0 .../Models/BlogEntry.cs | 0 .../Models/IRepository.cs | 0 Indexer/WebCrawler/WebCrawler.csproj | 4 ++ 10 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 Indexer/Indexer/Indexer.csproj create mode 100644 Indexer/Indexer/Preprocessor.cs create mode 100644 Indexer/Indexer/Program.cs create mode 100644 Indexer/Models/Models.csproj rename Indexer/{WebCrawler => Models}/Models/BlogEntriesRepository.cs (100%) rename Indexer/{WebCrawler => Models}/Models/BlogEntry.cs (100%) rename Indexer/{WebCrawler => Models}/Models/IRepository.cs (100%) diff --git a/.gitignore b/.gitignore index 0df7638..6829651 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,8 @@ /Indexer/WebCrawler/obj /Indexer/WebCrawler/bin/Debug/net5.0 /.vs +/Indexer/Indexer/bin/Debug/net5.0 +/Indexer/Indexer/obj/Debug/net5.0 +/Indexer/Indexer/obj +/Indexer/Models/bin/Debug/net5.0 +/Indexer/Models/obj diff --git a/Indexer/Indexer.sln b/Indexer/Indexer.sln index d76b612..e33cac9 100644 --- a/Indexer/Indexer.sln +++ b/Indexer/Indexer.sln @@ -3,7 +3,11 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.31112.23 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "WebCrawler", "WebCrawler\WebCrawler.csproj", "{D0ED9338-791A-428D-AC37-E41210B6DAF4}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "WebCrawler", "WebCrawler\WebCrawler.csproj", "{D0ED9338-791A-428D-AC37-E41210B6DAF4}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer", "Indexer\Indexer.csproj", "{B597653B-2773-48B2-BF4A-29D150450AD9}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Models", "Models\Models.csproj", "{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -15,6 +19,14 @@ Global {D0ED9338-791A-428D-AC37-E41210B6DAF4}.Debug|Any CPU.Build.0 = Debug|Any CPU {D0ED9338-791A-428D-AC37-E41210B6DAF4}.Release|Any CPU.ActiveCfg = Release|Any CPU {D0ED9338-791A-428D-AC37-E41210B6DAF4}.Release|Any CPU.Build.0 = Release|Any CPU + {B597653B-2773-48B2-BF4A-29D150450AD9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {B597653B-2773-48B2-BF4A-29D150450AD9}.Debug|Any CPU.Build.0 = Debug|Any CPU + {B597653B-2773-48B2-BF4A-29D150450AD9}.Release|Any CPU.ActiveCfg = Release|Any CPU + {B597653B-2773-48B2-BF4A-29D150450AD9}.Release|Any CPU.Build.0 = Release|Any CPU + {A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/Indexer/Indexer/Indexer.csproj b/Indexer/Indexer/Indexer.csproj new file mode 100644 index 0000000..1060521 --- /dev/null +++ b/Indexer/Indexer/Indexer.csproj @@ -0,0 +1,16 @@ +<Project Sdk="Microsoft.NET.Sdk"> + + <PropertyGroup> + <OutputType>Exe</OutputType> + <TargetFramework>net5.0</TargetFramework> + </PropertyGroup> + + <ItemGroup> + <ProjectReference Include="..\Models\Models.csproj" /> + </ItemGroup> + + <ItemGroup> + <Folder Include="Utils\" /> + </ItemGroup> + +</Project> diff --git a/Indexer/Indexer/Preprocessor.cs b/Indexer/Indexer/Preprocessor.cs new file mode 100644 index 0000000..196b376 --- /dev/null +++ b/Indexer/Indexer/Preprocessor.cs @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer +{ + public class Preprocessor + { + + } +} diff --git a/Indexer/Indexer/Program.cs b/Indexer/Indexer/Program.cs new file mode 100644 index 0000000..922505f --- /dev/null +++ b/Indexer/Indexer/Program.cs @@ -0,0 +1,60 @@ + +using System; +using System.IO; +using System.Text.Json; +using WebCrawler.Models; + +namespace Indexer +{ + class Program + { + + + static void Main(string[] args) + { + new Program().Run(args); + } + + public void Run(string[] args) + { + foreach (string path in args) + { + if (File.Exists(path)) + { + // This path is a file + ProcessFile(path); + } + else if (Directory.Exists(path)) + { + // This path is a directory + ProcessDirectory(path); + } + else + { + Console.WriteLine("{0} is not a valid file or directory.", path); + } + } + } + + public static void ProcessDirectory(string targetDirectory) + { + // Process the list of files found in the directory. + string[] fileEntries = Directory.GetFiles(targetDirectory); + foreach (string fileName in fileEntries) + ProcessFile(fileName); + + // Recurse into subdirectories of this directory. + string[] subdirectoryEntries = Directory.GetDirectories(targetDirectory); + foreach (string subdirectory in subdirectoryEntries) + ProcessDirectory(subdirectory); + } + + // Insert logic for processing found files here. + public static void ProcessFile(string path) + { + string contents = File.ReadAllText(path); + BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents); + Console.WriteLine("Processed file '{0}'.", path); + } + } +} diff --git a/Indexer/Models/Models.csproj b/Indexer/Models/Models.csproj new file mode 100644 index 0000000..563e6f9 --- /dev/null +++ b/Indexer/Models/Models.csproj @@ -0,0 +1,7 @@ +<Project Sdk="Microsoft.NET.Sdk"> + + <PropertyGroup> + <TargetFramework>net5.0</TargetFramework> + </PropertyGroup> + +</Project> diff --git a/Indexer/WebCrawler/Models/BlogEntriesRepository.cs b/Indexer/Models/Models/BlogEntriesRepository.cs similarity index 100% rename from Indexer/WebCrawler/Models/BlogEntriesRepository.cs rename to Indexer/Models/Models/BlogEntriesRepository.cs diff --git a/Indexer/WebCrawler/Models/BlogEntry.cs b/Indexer/Models/Models/BlogEntry.cs similarity index 100% rename from Indexer/WebCrawler/Models/BlogEntry.cs rename to Indexer/Models/Models/BlogEntry.cs diff --git a/Indexer/WebCrawler/Models/IRepository.cs b/Indexer/Models/Models/IRepository.cs similarity index 100% rename from Indexer/WebCrawler/Models/IRepository.cs rename to Indexer/Models/Models/IRepository.cs diff --git a/Indexer/WebCrawler/WebCrawler.csproj b/Indexer/WebCrawler/WebCrawler.csproj index 8f52e34..324450c 100644 --- a/Indexer/WebCrawler/WebCrawler.csproj +++ b/Indexer/WebCrawler/WebCrawler.csproj @@ -10,4 +10,8 @@ <PackageReference Include="System.Net.Http" Version="4.3.4" /> </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\Models\Models.csproj" /> + </ItemGroup> + </Project> -- GitLab From afae65953163dbd91b446c8e68d03bd824c1e047 Mon Sep 17 00:00:00 2001 From: Radek Mrvec <radek.mrvec@sumanet.cz> Date: Thu, 22 Apr 2021 20:19:54 +0200 Subject: [PATCH 2/6] fixed #8698 escaping in JSON file and also added downloading of whole HTML page f --- .gitignore | 376 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 363 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 6829651..7d8da62 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,363 @@ -################################################################################ -# This .gitignore file was automatically created by Microsoft(R) Visual Studio. -################################################################################ - -/Indexer/.vs/Indexer/v16 -/Indexer/WebCrawler/obj -/Indexer/WebCrawler/bin/Debug/net5.0 -/.vs -/Indexer/Indexer/bin/Debug/net5.0 -/Indexer/Indexer/obj/Debug/net5.0 -/Indexer/Indexer/obj -/Indexer/Models/bin/Debug/net5.0 -/Indexer/Models/obj +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Oo]ut/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd \ No newline at end of file -- GitLab From add9dae90cf211ed9057ef9f0ce19d01b23e9f52 Mon Sep 17 00:00:00 2001 From: Radek Mrvec <radek.mrvec@sumanet.cz> Date: Sat, 24 Apr 2021 22:54:09 +0200 Subject: [PATCH 3/6] refs #8331 finished preprocessor with stemmer. --- .gitignore | 2 +- .../.vs/Indexer/DesignTimeBuild/.dtbcache.v2 | Bin 60901 -> 78512 bytes Indexer/Indexer.sln | 6 +- Indexer/Indexer/Indexer.Indexer.csproj | 18 ++++ Indexer/Indexer/Indexer.csproj | 20 +--- Indexer/Indexer/Program.cs | 11 ++- .../Indexer/Properties/launchSettings.json | 8 ++ Indexer/Indexer/Utils/Preprocessor.cs | 90 ++++++++++++++++++ .../Enums/IdxLanguage.cs} | 7 +- .../{Models.csproj => Indexer.Models.csproj} | 2 +- .../Models/Models/BlogEntriesRepository.cs | 2 +- Indexer/Models/Models/TextData.cs | 13 +++ Indexer/Models/Models/TextTokens.cs | 13 +++ Indexer/Models/{Models => Repos}/BlogEntry.cs | 2 +- .../Models/{Models => Repos}/IRepository.cs | 2 +- Indexer/WebCrawler/GamasutraCrawl.cs | 55 +++++++---- Indexer/WebCrawler/GamasutraParser.cs | 4 +- ...awler.csproj => Indexer.WebCrawler.csproj} | 5 +- Indexer/WebCrawler/Program.cs | 28 ++++-- Indexer/WebCrawler/Utils/HtmlDownloader.cs | 2 +- 20 files changed, 232 insertions(+), 58 deletions(-) create mode 100644 Indexer/Indexer/Indexer.Indexer.csproj create mode 100644 Indexer/Indexer/Properties/launchSettings.json create mode 100644 Indexer/Indexer/Utils/Preprocessor.cs rename Indexer/{Indexer/Preprocessor.cs => Models/Enums/IdxLanguage.cs} (56%) rename Indexer/Models/{Models.csproj => Indexer.Models.csproj} (68%) create mode 100644 Indexer/Models/Models/TextData.cs create mode 100644 Indexer/Models/Models/TextTokens.cs rename Indexer/Models/{Models => Repos}/BlogEntry.cs (86%) rename Indexer/Models/{Models => Repos}/IRepository.cs (86%) rename Indexer/WebCrawler/{WebCrawler.csproj => Indexer.WebCrawler.csproj} (53%) diff --git a/.gitignore b/.gitignore index 7d8da62..43d9deb 100644 --- a/.gitignore +++ b/.gitignore @@ -360,4 +360,4 @@ MigrationBackup/ .ionide/ # Fody - auto-generated XML schema -FodyWeavers.xsd \ No newline at end of file +FodyWeavers.xsd diff --git a/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 b/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 index 157bdf0469ee702a9156b45762c5d6311cdcb60d..c912c6df6fdd87ce37466d9e2240b2a99f4a3a71 100644 GIT binary patch delta 7501 zcmeH~e{2+G8pmg*R74Mo{IDWW1nd<KX?J(JZC4IyEu}0_OA7^o7Tlektqa@TvOlb~ z;FKcAaq7V`aEczMoH51_&odk2dFmPC8Bjvd7-Nj_j4{MhLwM$%7|*EjzVCDwN^xZ` zx&C!*`h7p&d4J6NzVp8Gyq(h@7X112DEW^mxw*DF7<5+#oq<42mEYs6^!VJWLv`rE zfV)!FG>=QweXbhKU+Jn0=s}O>*WFbf)urif-J@%6w<opldU;FgsN?6}NI<_^kG1<l zk@m&9KiSzH(G%`+M|-BA+-B;BYsbwRo^w+{{mt#t^b4}}Sa~cNNrbv|EgA`iI;Fz$ z&GBeN@}jS)nVJ{(B`rz)X?~$yZ2m*8kzHo7(qX4Jm<N>-xgD)5v#q24%|9_x3?L8= zOQRC8q%Mhe>$B~CdWM;H6En33r=7_x8kQ-IvirDTP8W@8$ehx}qjt)&YTh<5$9%o8 z46b`}X?pw>(~|OfjIg088qmY>iyuq2+re_X&xQ;0Zn|NO9*c*fk%dlk-_PchMv|EO zjPmmK%q=ghU9QEuW6{m#yuQFysx;bJ?sS%`s(GkyiL9pUw_%Jf^VoI+Y2&@G$Qv;< zN3Nm8^x(N2GGut0(Z-W$bI}v|(ZQ70N4tCLqg~x$Jz*-wd&Z49+hyB}^}Teh?tovq z@sevUYL&F=@o4xi9iwQ~(~bRCD1&Pyhs>gZ>r#K2{fqpcLVD{!TZ1yU!nBS}o}3+U zmSW%z^USg7W3xFn%?Za>$?MJF@tMQ&_a66SE&g@<KBPTwK47oQ>Y`av(aM;Ph1B$T zJQ_1Ud#h!J<EI~**?MAbeXU=uL6mSgD%~!<%I8vjZVe&A=?>`5V3itlR%xDq8t_;8 z5L^8C)i~W%)xJufrn@RDRn;Sj&AKwr)fID{ke%j`9Ww61JZUNJd3~;XN)3n$M$XdN zIcGhj!%J2YTVQt1jnWjoTD+X)qShRHStngM1N*hC5UX1HBLzhj316&JPuv_1`D4CV zue46A^RS0trjE}u54<(gn9!<sN8|0chNGPgkwmN)FRV0aFwbivl!y-IR$OZ8b@8~~ z<q!9IBf+S+O|9;5)l|AP4^9Kk6?E%0T8+P2^97wXs^->%j+&r4!(Uykx;3{Jbh~_- z!{HA215TGaSm~(pc-+#~)X^1JX6Kmtx@63`3FBhs4cU^ly1zc=>j@7(3e0u#BeFkr zbb)zRx#qebsV%k)%2zDTQFd*P{Bzsve9zJf(a))Z4x=pRgF0W^Uc$U8id|5abFU1! zqo{aOMKo#hv5C3ns)|jMtFS}tlXFko+<n|_u79gcEpAJMa8xW!bcGj3dm`beFM#Ka z^;L7u(_D69YP#=4sdCHcm5BbjmXM~2pt~Zf`NFGw2|XMRCG^aG3F&dExK$78F+HN` z&B;V}GLb1VUp`rDK7O*q{O;s7`8K?^*%d!`5M<Y9@R$*6;3&zwyxF46i<zmh8mnYv zi-tVC!Q!#f;BOqDE0Pq1I-xtE5VRTE0)?S2C;~;HZfGkMgT|w692D&n(8c#KE`5_J zxJeR??}B=uyP;l4-0&W#584K8hwg<8s2|z^4M01g`=DKr804>@-O#V0J)-h&p!=ce znT{TS7o9x_4eju^xE7seJIlADy=b!!N<sS}yJJxFYlx1EG98=nqJsmF=tPWuc;h|D z%MSP;{O=$c?L`NI4}k@sXe(wb>Wk}#GuMZ}D{%b?h`w5-M`1+e-$SDE7!-<u<QhbF z0H`7zM)C(pbnuwSz^^?$@b=#)87XVEyqN8u)h5#fYmGd3g%m~SJX>q!{qjERcBCn5 zo!lTwto4HxNMUP(%vQT>b=c(_<t5f8*}4N44_iLD(elfd1_WeF2ZFNI3EU}LMhM(2 zTU&szY;^$<*@^<)vb7b>_ggV}DI>wq;=;v+OQ2+*m9*2lkbd_Fm3Wy-*y@qJ*4;9I z!P6|{&0;DAN}0-lGN!q}T&8)zJT_=PIG?Eks9<seP9_ylncRSzsT!ze@&F#D1;7HP zg}_3lMZhAaI-riJ9;jz(02-JYfkq}TfWPHJOie%&Q!~)aWVC=SOsjxZOl?3L(^_CH z(>h=s(*|GzQwPw&v<cY6<O6(68lW-hfX>tjbTWm25YrZ53sV=+#S{giOk07iOmQI2 zlr+F3QxDL?)C=@7^#OfM+kx#EB49A>0Cq6#1a>m*0(LR&26i*;0roK658Ti6An+j5 zUSKa%3P>@TfXQ?aILP!6@DS4>FK&E@=}}xh%5)ew%=8%W7}F8p2-6e56HHG6Pcl6P zJjL`h@HEp=;3(5Gz%xwG0?#r%2Rz4=2GUH=1J5(P0KCBTBJd*9OTbG^uXwTkuQ0ue z%U79R172f#9eADT4d4x?W56+{H-R^qP5>vEP68*HP64Nw-T~fWdKY+?={?{*ruTvO znLYqMVEPdFkm)1fBc_jmkC{FJj8B+81wUmv4V-5B4ET)cbKrBPFM%(a&H!ha&H`td zEWl!7fSJAmzGC_s_?qb(;2WlIfp3|<2fk<e0r-I}S@w#AtpZ^RU<wW43gJczGa6=$ zFk@h@7UpW0vBHdnDH5g#<{DwHfte`GM3~9KOok~IrWj_5FjHWr3NsaEx-ipWW(YF_ zW~OYw&4imRvS!1S3R4PGCQKR3Tw&(I%oAoF%zR<y!&C@U0pk?L38M<5!nlQT!&D1X z4dW5U1G7Mw1uzR`>_4PzEfQIaVCsaagQ*v$9;QK<2AD=+8ezP`cwv?cvmB;Lm?oHJ zVVYrDglU24MRZD88|7t;#JUGLeIkHuvqRT*`In+}gS`BFEM@CnlpUanE0AZ%BJH=+ z9d<fkH{6NzyT7K-e(_IbZGZpcnUMasX_7?J0Li3Kfpv@W?+H{W?!QR66<w~DVXRsO zDP3vRDRixMt76q7bR%O5T~AYKJWWGfrRli&!|74)ES_YX5BmH5+tB{Mi0p<KNgiQ* zq@uWyXBcn63)V6MQQQdS7sk<jawMWS!gF3>e59hdk!KhmsVHvb8OBE{iX%Mc6~;xh z8Hp%v<Qc}%{fI<yBhN5CQc=7u&oD0j8@0nWqO6G6ixrVJ*y$2G9TK-+G|ztFOwakU zTN%@+)LKg+uX*ImL}M9T6;ut?K+B;OP!qHOx(!;W)b{ru$pri!d<vZp_|2Jszu#J^ z)InB@f)ug7RiUe_Rf@G5u^t&?X&e>Nc&p7m7S<>*)>@<!MBUrPxv);L)}wBll0Nlz z&*g7~kvd)SJX|h+BV7JQxcrUq|ND)wFxNMNwZ=YLHrQ#qopuaAY+lAk!qZCOMjRcR d%)@7j=idQ0^pNr4+7I0fX^=Q*0+4;s{0EVYm+}Ar delta 2721 zcmai!4NTN!9LMiDK+{U|g?uS6Ul<YN<=_rHnE?U<0TCgQatL?a9|R5#xg#Rc!_2kj zTHsSF=bCG+HP%|$t-03JxoXXP3$d~0T6501)>^-3zd3Wget)OP=CW@4{yv}Q|GfPl z{_OeqO8k|JacXEx)fVMB);ikra@<yq<@D$IvVHB&{5;F)bhuqUcb?bfvMg_QZoWUq znUm+U{EqDQ+-$$s=XSTX``0?0YsW^Ht4{5mDA<2~fo}aaStUuG6<4y(V_OPcs<*0p zRQ2?T#S_)~hFYy0I?3J~tB>e|_GGmM1F9cvw0tG~-ob#?pXm<-Z1Lg#0m~*`vU<)P zz4Pv9&z))8;$!n=*J{0ZVx1m6zc9KkzE@$#@q}8`6SGpH$+I3F(0}b{o9nR6c4Rsn znYj+#J-ji;VN0wZ3Wcqn%qOjIW<@yMm*vdN&2-ulZeso&y|x^^Yd9$}baPHG=4`_p zPi%1Xli@wGYm)sD-E^uzkL_A>yR}B&clrT6H`1vKM<zWhg1#<KspT8!@L&m7X0~VM zJh9XK=YT%H&$hzX+57J+M0f69ZjYV)QPrg{jW_7p*>j`&UZT>}HCe%d@VfuoJ6#_> zn{?0g3+aiI^{PRio?NM~O}g~RH?G*3mg=+L1a6;h#^js}^<x)O7cQs>_XJ9VgS~;E z*N>0f7Sh${R^=qs1pQVZ<k=7ic0A?nutHB;eZf#?IM_dgA-2R@-IX(sb^7VL<>_~Q zYklx^TJ+F`6uaIqm8|2Z9@ni?!#Dp4Bh!b7ZQs`)?7Gi(^EVGDw%cqQkP@U6DML0Q z<wylmi9CT+A=O9?@+9&A=GKBT|0(3|Yeeq8$TBQahsjSP^~ffq0g(ltK^l=~k<G{! zqzUmL&B#`y1!+aRh`gi^X+!*og|s6b5tRA2HrfW4#XAvMSQhC*x{(0VgWR47??r-$ z!h9Jc>;vOKnJb&hcxi9H)$Rvpp&bHcuevab3?4va@Eml?i`?xcre6R<Y|o(#BC_%G z@&LcqX#B-57e_|TCbeIUng-PU<{4GW_?bo}V6wA$R<YTv%$8d{O*1_n^fW886*I?7 ziz+j%%6QRu%lOnr)256cu#{;BI+WQ4bSl#YbSo18dX$Ouf<a~afbGik$KE2O%Grbz zEi_@Vu-E{4M$L1vI*9tm0ZOi5N@iLGtYS(5QkYVKRHif_jVT>SXUYIFn6iK@CI{eP z$^~*GOfJyHln>-HxdAs*0Z_oS9$3#*2oy3E1I0`wKnYVBP{vdalrvQVl}uGY6;lmR z!&D2@GSvZfO!YuLQv=Yz)Ce?kgf@elnVNtmre>g-sRd|Z@&aC_HlU5k0xYHupo6Is z=w#{!x|w=_9;P4=WZDjFX9@u!rU77pX%HA>8Uluxb^tq=B0IsIAb`KcZeTal9$*jC zUSKcN%fQP_uK=$w?F05Py$Za_^cwIQ(-<(uqyf$J2Ji;co4}h)2Y>@i2Z4i3Zv$^L z9Rdz9y(9a+!}KojF4KF!dra>G?=y`9<4hj_A259ge8}_>@DbBt;4o7Zh%$W&e9CkL zIKuQ9@EOzRz~@Xyful^vf#XakfD=q#0ADbDRe>{q#q>3rUo)KqPBNVWPBEPU&M=(= z&M{2@6HJr9B-3}mcT7{j6w~*>_e>Xoi%dTNKQR3W{K#|(xWsfBxXg3~xWaT5xXScX z1-}1JOh2RfGt)1?FHFAzzcLxXFfqVP*MMtG*MaLyH-H;VzXQKB{R#ZZ^f&M~8>K3w zu!$3igCvL~KxT`~hRhL}1GzUMb}#IHk^3R@Mdm{mh%A6C5?KUE6iI|E5m^FRCbA6j zkjO)jB#|V@3Xv6%l_D!4t3_5rQbbZ9sS4j8l}(yFN`s_}q(d@9G9XzZSrCVa1ClF} z3vr3KAo(Ks5VwdMQXo<QSue64QYcagDHbV)lqh_ER5oSus0>mrQVyvUsf1LCR6%M) zY9O^DwU9cII>__5kmL%iWE3-mCojmIwIg<04WsVIHTuNNy~@TDYc?q-Zdv@yP7HdT x7F9uaDXEcI9f{T5v2|WV{o|O~ql%6k%TnWZz3R8ZdFF9@ktsxq>_sCZ{{UtD+!O!+ diff --git a/Indexer/Indexer.sln b/Indexer/Indexer.sln index e33cac9..1973e44 100644 --- a/Indexer/Indexer.sln +++ b/Indexer/Indexer.sln @@ -3,11 +3,11 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.31112.23 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "WebCrawler", "WebCrawler\WebCrawler.csproj", "{D0ED9338-791A-428D-AC37-E41210B6DAF4}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer.WebCrawler", "WebCrawler\Indexer.WebCrawler.csproj", "{D0ED9338-791A-428D-AC37-E41210B6DAF4}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer", "Indexer\Indexer.csproj", "{B597653B-2773-48B2-BF4A-29D150450AD9}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer.Indexer", "Indexer\Indexer.Indexer.csproj", "{B597653B-2773-48B2-BF4A-29D150450AD9}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Models", "Models\Models.csproj", "{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Indexer.Models", "Models\Indexer.Models.csproj", "{A6FA6649-2F36-47D8-9B26-B178E5BB2EEB}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution diff --git a/Indexer/Indexer/Indexer.Indexer.csproj b/Indexer/Indexer/Indexer.Indexer.csproj new file mode 100644 index 0000000..4b89db9 --- /dev/null +++ b/Indexer/Indexer/Indexer.Indexer.csproj @@ -0,0 +1,18 @@ +<Project Sdk="Microsoft.NET.Sdk"> + + <PropertyGroup> + <OutputType>Exe</OutputType> + <TargetFramework>net5.0</TargetFramework> + </PropertyGroup> + + <ItemGroup> + <ProjectReference Include="..\Models\Indexer.Models.csproj" /> + </ItemGroup> + + <ItemGroup> + <PackageReference Include="HtmlAgilityPack" Version="1.11.33" /> + <PackageReference Include="Microsoft.ML" Version="1.5.5" /> + <PackageReference Include="StemmersNet.Standard" Version="1.1.1" /> + </ItemGroup> + +</Project> diff --git a/Indexer/Indexer/Indexer.csproj b/Indexer/Indexer/Indexer.csproj index 1060521..0f14913 100644 --- a/Indexer/Indexer/Indexer.csproj +++ b/Indexer/Indexer/Indexer.csproj @@ -1,16 +1,4 @@ -<Project Sdk="Microsoft.NET.Sdk"> - - <PropertyGroup> - <OutputType>Exe</OutputType> - <TargetFramework>net5.0</TargetFramework> - </PropertyGroup> - - <ItemGroup> - <ProjectReference Include="..\Models\Models.csproj" /> - </ItemGroup> - - <ItemGroup> - <Folder Include="Utils\" /> - </ItemGroup> - -</Project> +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup /> +</Project> \ No newline at end of file diff --git a/Indexer/Indexer/Program.cs b/Indexer/Indexer/Program.cs index 922505f..4e11e51 100644 --- a/Indexer/Indexer/Program.cs +++ b/Indexer/Indexer/Program.cs @@ -1,10 +1,11 @@  +using Indexer.Models; using System; +using System.Collections.Generic; using System.IO; using System.Text.Json; -using WebCrawler.Models; -namespace Indexer +namespace Indexer.Indexer { class Program { @@ -54,7 +55,11 @@ namespace Indexer { string contents = File.ReadAllText(path); BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents); - Console.WriteLine("Processed file '{0}'.", path); + + Preprocessor processor = new(); + HashSet<string> tokens = processor.Process(entry); + + Console.WriteLine("Processed file '{0}'. Number of tokens {1}", path, tokens.Count); } } } diff --git a/Indexer/Indexer/Properties/launchSettings.json b/Indexer/Indexer/Properties/launchSettings.json new file mode 100644 index 0000000..2d62df7 --- /dev/null +++ b/Indexer/Indexer/Properties/launchSettings.json @@ -0,0 +1,8 @@ +{ + "profiles": { + "Indexer.Indexer": { + "commandName": "Project", + "commandLineArgs": "downloaded_json" + } + } +} \ No newline at end of file diff --git a/Indexer/Indexer/Utils/Preprocessor.cs b/Indexer/Indexer/Utils/Preprocessor.cs new file mode 100644 index 0000000..3ccff14 --- /dev/null +++ b/Indexer/Indexer/Utils/Preprocessor.cs @@ -0,0 +1,90 @@ +using Indexer.Models; +using Indexer.Models.Enums; +using Indexer.Models.Models; +using Iveonik.Stemmers; +using Microsoft.ML; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using System.Web; + +namespace Indexer.Indexer +{ + public class Preprocessor + { + + public HashSet<string> Process(BlogEntry entry) + { + string content = RemoveTags(entry.Content); + content = StripUnicodeCharactersFromString(content); + string[] tokens = CreateTokens(content, IdxLanguage.English); + return StemmTokens(new EnglishStemmer(), tokens); + } + + /// <summary> + /// Removing HTML tags from given string. + /// </summary> + /// <param name="input"></param> + /// <returns>String without HTML tags</returns> + private string RemoveTags(string input) + { + if (string.IsNullOrEmpty(input)) return null; + + HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument(); + doc.LoadHtml(input); + + return HttpUtility.HtmlDecode(doc.DocumentNode.InnerText); ; + } + + /// <summary> + /// Removing unicode codes and escaped chars from given string + /// </summary> + /// <param name="input"></param> + /// <returns>String without unicode and escaped chars</returns> + private string StripUnicodeCharactersFromString(string input) + { + string noUnicode = Regex.Replace(input, @"[\u0000-\u001F\u0080-\u009f\u0100-\uFFFF]", String.Empty); + return Regex.Replace(noUnicode, @"\t|\n|\r", ""); + } + + + /// <summary> + /// Creating tokens and and removing default stop words based on language. + /// </summary> + /// <param name="input"></param> + /// <returns></returns> + private string[] CreateTokens(string input, IdxLanguage lang) + { + var context = new MLContext(); + var emptyData = new List<TextData>(); + var data = context.Data.LoadFromEnumerable(emptyData); + + var tokenization = context.Transforms.Text.TokenizeIntoWords("Tokens", "Text", separators: new[] { ' ', '.', ',',')','(', '[', ']'}) + .Append(context.Transforms.Text.RemoveDefaultStopWords("Tokens", "Tokens", + Microsoft.ML.Transforms.Text.StopWordsRemovingEstimator.Language.English)); + + var stopWordsModel = tokenization.Fit(data); + var engine = context.Model.CreatePredictionEngine<TextData, TextTokens>(stopWordsModel); + return engine.Predict(new TextData { Text = input }).Tokens; + } + + /// <summary> + /// Going through all the tokens and using stemmer on them. + /// </summary> + /// <param name="stemmer"></param> + /// <param name="words"></param> + /// <returns></returns> + private HashSet<string> StemmTokens(IStemmer stemmer, params string[] words) + { + HashSet<string> stems = new HashSet<string>(); + foreach (string word in words) + { + stems.Add(stemmer.Stem(word)); + } + return stems; + } + } +} diff --git a/Indexer/Indexer/Preprocessor.cs b/Indexer/Models/Enums/IdxLanguage.cs similarity index 56% rename from Indexer/Indexer/Preprocessor.cs rename to Indexer/Models/Enums/IdxLanguage.cs index 196b376..b91ee3a 100644 --- a/Indexer/Indexer/Preprocessor.cs +++ b/Indexer/Models/Enums/IdxLanguage.cs @@ -4,10 +4,11 @@ using System.Linq; using System.Text; using System.Threading.Tasks; -namespace Indexer +namespace Indexer.Models.Enums { - public class Preprocessor + public enum IdxLanguage { - + English, + Czech } } diff --git a/Indexer/Models/Models.csproj b/Indexer/Models/Indexer.Models.csproj similarity index 68% rename from Indexer/Models/Models.csproj rename to Indexer/Models/Indexer.Models.csproj index 563e6f9..6e9ceba 100644 --- a/Indexer/Models/Models.csproj +++ b/Indexer/Models/Indexer.Models.csproj @@ -1,4 +1,4 @@ -<Project Sdk="Microsoft.NET.Sdk"> +<Project Sdk="Microsoft.NET.Sdk"> <PropertyGroup> <TargetFramework>net5.0</TargetFramework> diff --git a/Indexer/Models/Models/BlogEntriesRepository.cs b/Indexer/Models/Models/BlogEntriesRepository.cs index b940fbd..c5eb131 100644 --- a/Indexer/Models/Models/BlogEntriesRepository.cs +++ b/Indexer/Models/Models/BlogEntriesRepository.cs @@ -4,7 +4,7 @@ using System.Linq; using System.Text; using System.Threading.Tasks; -namespace WebCrawler.Models +namespace Indexer.Models { public class BlogEntriesRepository : IRepository { diff --git a/Indexer/Models/Models/TextData.cs b/Indexer/Models/Models/TextData.cs new file mode 100644 index 0000000..06ca52e --- /dev/null +++ b/Indexer/Models/Models/TextData.cs @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer.Models +{ + public class TextData + { + public string Text { get; set; } + } +} diff --git a/Indexer/Models/Models/TextTokens.cs b/Indexer/Models/Models/TextTokens.cs new file mode 100644 index 0000000..03886be --- /dev/null +++ b/Indexer/Models/Models/TextTokens.cs @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer.Models.Models +{ + public class TextTokens + { + public string[] Tokens { get; set; } + } +} diff --git a/Indexer/Models/Models/BlogEntry.cs b/Indexer/Models/Repos/BlogEntry.cs similarity index 86% rename from Indexer/Models/Models/BlogEntry.cs rename to Indexer/Models/Repos/BlogEntry.cs index e5f0c7c..4326c59 100644 --- a/Indexer/Models/Models/BlogEntry.cs +++ b/Indexer/Models/Repos/BlogEntry.cs @@ -4,7 +4,7 @@ using System.Linq; using System.Text; using System.Threading.Tasks; -namespace WebCrawler.Models +namespace Indexer.Models { public class BlogEntry { diff --git a/Indexer/Models/Models/IRepository.cs b/Indexer/Models/Repos/IRepository.cs similarity index 86% rename from Indexer/Models/Models/IRepository.cs rename to Indexer/Models/Repos/IRepository.cs index 476f4e7..0a7efec 100644 --- a/Indexer/Models/Models/IRepository.cs +++ b/Indexer/Models/Repos/IRepository.cs @@ -4,7 +4,7 @@ using System.Linq; using System.Text; using System.Threading.Tasks; -namespace WebCrawler.Models +namespace Indexer.Models { public interface IRepository { diff --git a/Indexer/WebCrawler/GamasutraCrawl.cs b/Indexer/WebCrawler/GamasutraCrawl.cs index 2b55032..8784b43 100644 --- a/Indexer/WebCrawler/GamasutraCrawl.cs +++ b/Indexer/WebCrawler/GamasutraCrawl.cs @@ -1,14 +1,16 @@ -using System; +using Indexer.Models; +using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; +using System.Text.Encodings.Web; using System.Text.Json; +using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; -using WebCrawler.Models; -namespace WebCrawler +namespace Indexer.WebCrawler { class GamasutraCrawl { @@ -22,12 +24,12 @@ namespace WebCrawler private readonly GamasutraParser parser = new GamasutraParser(); - public void Start() + public void Start(int maxPages) { Console.WriteLine("Downloading HTML page: " + url + urlSection); - GetLinks("/member/page="); - GetLinks("/expert/page="); + GetLinks("/member/page=", maxPages); + GetLinks("/expert/page=", maxPages); blogEntries.PrintRepo(); @@ -35,7 +37,7 @@ namespace WebCrawler } - private void GetLinks(string typeSection) + private void GetLinks(string typeSection, int maxPages) { int pageNum = 1; string html = null; @@ -55,29 +57,50 @@ namespace WebCrawler Console.WriteLine("No more pages."); html = null; } - } while (html != null && pageNum < 10); + } while (html != null && pageNum < maxPages); } private void ProcessEntries() { - foreach(var item in blogEntries.BlogEntryLinks) + foreach (var item in blogEntries.BlogEntryLinks) { - BlogEntry entry = new BlogEntry { url = item.Remove(0, 7)}; + BlogEntry entry = new BlogEntry { url = item.Remove(0, 7).Replace("\t","") }; StringBuilder fileName = new StringBuilder(); + fileName.Append(entry.url.Replace("/", "-")); - fileName.Append(".json"); - if (!File.Exists("downloaded/" + fileName)) + string html; + if (!File.Exists("downloaded_html/" + fileName)) { Console.WriteLine("Downloading entry HTML page: " + url + item); + html = download.GetHtml(url + item); + File.WriteAllText("downloaded_html/" + fileName, html); + } + else + { + html = File.ReadAllText("downloaded_html/" + fileName); + } + + parser.GetEntry(html, entry); + + fileName.Clear(); + fileName.Append(entry.url.Replace("/", "-")); + fileName.Append(".json"); + + if (!File.Exists("downloaded_json/" + fileName)) + { + + JsonSerializerOptions options = new JsonSerializerOptions + { + Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping, + WriteIndented = true + }; - string html = download.GetHtml(url + item); - parser.GetEntry(html, entry); - string json = JsonSerializer.Serialize(entry); + string json = JsonSerializer.Serialize(entry, options); - File.WriteAllText("downloaded/" + fileName, json); + File.WriteAllText("downloaded_json/" + fileName, json); Console.WriteLine("File " + fileName + " saved"); Thread.Sleep(500); diff --git a/Indexer/WebCrawler/GamasutraParser.cs b/Indexer/WebCrawler/GamasutraParser.cs index 95d8302..8e26700 100644 --- a/Indexer/WebCrawler/GamasutraParser.cs +++ b/Indexer/WebCrawler/GamasutraParser.cs @@ -1,12 +1,12 @@ using HtmlAgilityPack; +using Indexer.Models; using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; -using WebCrawler.Models; -namespace WebCrawler +namespace Indexer.WebCrawler { public class GamasutraParser { diff --git a/Indexer/WebCrawler/WebCrawler.csproj b/Indexer/WebCrawler/Indexer.WebCrawler.csproj similarity index 53% rename from Indexer/WebCrawler/WebCrawler.csproj rename to Indexer/WebCrawler/Indexer.WebCrawler.csproj index 324450c..fca20f7 100644 --- a/Indexer/WebCrawler/WebCrawler.csproj +++ b/Indexer/WebCrawler/Indexer.WebCrawler.csproj @@ -6,12 +6,11 @@ </PropertyGroup> <ItemGroup> - <PackageReference Include="HtmlAgilityPack" Version="1.11.31" /> - <PackageReference Include="System.Net.Http" Version="4.3.4" /> + <PackageReference Include="HtmlAgilityPack" Version="1.11.33" /> </ItemGroup> <ItemGroup> - <ProjectReference Include="..\Models\Models.csproj" /> + <ProjectReference Include="..\Models\Indexer.Models.csproj" /> </ItemGroup> </Project> diff --git a/Indexer/WebCrawler/Program.cs b/Indexer/WebCrawler/Program.cs index ac915eb..8763a82 100644 --- a/Indexer/WebCrawler/Program.cs +++ b/Indexer/WebCrawler/Program.cs @@ -3,20 +3,36 @@ using System; using System.Net.Http; using System.Threading.Tasks; -namespace WebCrawler +namespace Indexer.WebCrawler { class Program - { - - + { static void Main(string[] args) { Console.WriteLine("KIV/IR WebCrawler starting"); - System.IO.Directory.CreateDirectory("downloaded"); + System.IO.Directory.CreateDirectory("downloaded_json"); + System.IO.Directory.CreateDirectory("downloaded_html"); GamasutraCrawl crawler = new GamasutraCrawl(); - crawler.Start(); + + int maxPages = 10; + + if (args.Length > 0) { + + try { + maxPages = Int32.Parse(args[0]); + } + catch (FormatException) + { + Console.WriteLine($"Unable to parse '{args[0]}'."); + } + + } + + Console.WriteLine($"Going to process {maxPages} pages."); + + crawler.Start(maxPages); } } } diff --git a/Indexer/WebCrawler/Utils/HtmlDownloader.cs b/Indexer/WebCrawler/Utils/HtmlDownloader.cs index 44594d3..04442ae 100644 --- a/Indexer/WebCrawler/Utils/HtmlDownloader.cs +++ b/Indexer/WebCrawler/Utils/HtmlDownloader.cs @@ -6,7 +6,7 @@ using System.Net.Http; using System.Text; using System.Threading.Tasks; -namespace WebCrawler +namespace Indexer.WebCrawler { public class HtmlDownloader { -- GitLab From 131543086301299946c11cd56b3b4255607249bc Mon Sep 17 00:00:00 2001 From: Radek Mrvec <radek.mrvec@sumanet.cz> Date: Sun, 25 Apr 2021 21:02:14 +0200 Subject: [PATCH 4/6] refs #8332 Added Dependency injection and main Indexer clasess and interfaces --- .../.vs/Indexer/DesignTimeBuild/.dtbcache.v2 | Bin 78512 -> 83069 bytes Indexer/Indexer/Application.cs | 70 ++++++++++++++++++ Indexer/Indexer/Indexer.Indexer.csproj | 1 + Indexer/Indexer/Program.cs | 57 +++----------- Indexer/Indexer/Services/DictionaryService.cs | 12 +++ .../Indexer/Services/IDictionaryService.cs | 13 ++++ Indexer/Indexer/Services/IIndexerService.cs | 13 ++++ Indexer/Indexer/Services/IndexerService.cs | 14 ++++ Indexer/Models/Indexer.Models.csproj | 4 + Indexer/Models/Models/Dictionary.cs | 18 +++++ Indexer/Models/Models/Index.cs | 13 ++++ 11 files changed, 169 insertions(+), 46 deletions(-) create mode 100644 Indexer/Indexer/Application.cs create mode 100644 Indexer/Indexer/Services/DictionaryService.cs create mode 100644 Indexer/Indexer/Services/IDictionaryService.cs create mode 100644 Indexer/Indexer/Services/IIndexerService.cs create mode 100644 Indexer/Indexer/Services/IndexerService.cs create mode 100644 Indexer/Models/Models/Dictionary.cs create mode 100644 Indexer/Models/Models/Index.cs diff --git a/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 b/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 index c912c6df6fdd87ce37466d9e2240b2a99f4a3a71..fc9a77633dc400b98032d23164eb9536cd66fe9a 100644 GIT binary patch delta 6932 zcmeHJdvH|M9o};>0s@Kw45k8+%5*4n+4sYi67n!4B!rkicr1atckhM``w06WK#=7r zD2hogvWRaj)<;{FwS`t&YPI$yqJL{UZD;(u_sq_8=+xSN=iI%?-Am}`GE>{Bcjr6L z@BV(jbI$L6dtUkYx_=z4ARi2q_S$e*bo+gFyT>N_Lq3-;>~r{iqT3tvNj|U7>vnkj zL6_vV`5i8|+v}1<pXl_-L65`f$D6YB>QLr-vQF8%XU_UqL`o)7iE!E)iw5jgx78hp zMuLI3oKB_1ct}i!99CPPZIsp4YOPNsWosxJwRBxEytrnqoJ>U$@uha<{V9GDWIn=1 zA6BV96BE;0OMOW&Dfag}t#*^E{!AOl+*^04H*Lo3u<S)Oh-q=0MX_PC51B2huVhgn z&~`-Rfznl7bqAT%wV-o{>q^$8Pnn%|GN<iEr=QUmJE(S}3$x`Avz#r%^2#<@jHl|O znN(U%M&iAEHCv3XtY(VAx7Egei)-pL(R3!M{9(!>(#<02E+dlF^~<^Q&JyPsHTdKC zSoT^wiyuQ*QjElr;^VB<OfmQ#XRSSKt#TP_MFjCwIFXELF>EMV^4O%oJ6pLjT+@<o zpV&2lY+#O8#pQu?BF?}2zEmO}u(Q)}nm>IzhE|O+Tfo%z*)*uEnB=N-+Qg76EPH|u z(Id)o$ih9fAUR?&>=+cqs(%njUHr!L)>&k)0-I%Sonb&ZGw4&u_73H?BRA%SDHDm7 zFwLo5%!gG<gyc{n7I0ZzP^_dJHj7xN#;XUfAra%|J4(isEel<W>bW^TdFqS(U#~Ro z>=G`GmI|$!YMinxVLRG(<xyIy4Z&OW^nfM&m?c)t9FJo0GK<x0Rpakm{_LKQClX0{ z^Wbdd*q-T2uk>x++pCLsOt;cxv{JrIUsF^}{rJK3>ru6oM0G7X+oPNG8BKl;6;0>{ zn#(X?RzkkYpl<P7WmwcjZ#1m>GK}b1sq32Z`oXy@rOL52CO&*|Y2{BZtWnx0&dpCb z{2C#$zSd>erafbJuKs<Pv!Cgh@k3+EvJK${ASJtTNhy9ucb2at%Kqa$${(jq&HwYr zyoPx<tWG9+lOi-JWtp`m9f_s_tCMnnG9k&SR3fQ(-#uO@`hy<3<aIdwQqbXZ%3h~k z4m!Mkd&ut!`h22fbBEos7z#RNk0gs>(dCiClFjKAoubWe3qxMT4{XY(7iZ_&FMdj9 z&1{TkVyQrLXlJV!@6Cw4GQ2CTm%h574T6UVy5yZ{{(T{+nTD~h#5Ose%J*H`J~?|r zuvi`jP69dL6z~Xe8h8|V40s%P0(cU53V0frZn0RNLCxNO7AVKYtrknx@*TW<26ztm zF7Q0S488!o2)qP*5BNUt1K?%g72s9iERY9Y16Yth1bzgZ1I`0K27Us}L9oov>j3lh z2B7)NY7T#j0p?V9_E{bM4DVb3eh$0|m>u6#bo>j<G5c=;%m)jezneuvf7uxE+Zg{P zKwyA5VD&qwC!l68v(+*_>;J0Q|269C(f<u<=Ipls<Ngj{+{u_@ff$3Z8dsnP0FLEd z6c+*J;5{~gr?N(=_^^kZ$vr=dbWFOvz#etM$y-RZ-uYVYc0+HWV*A_Z)FP#0tn&?} z_M_36C5re_b!*X_og3vPj63TK>O9mg)mc!B3@2ys5;N{1!=t)K8SdPqDsn-OX?gCw zs<8;`bAPEK0e!AHU;Fp-H)k~)w2RPAQc&+;4z!2$`9s%`B55r_yOe~!M(Wl{>j>>O zk<iyj>vhrwLf3OrR+~Y|1~k@2ofRN-qsbKd8flYG>LGNKiG;pJ+N_hd5W3k!LLU$O zQ&DFH3EHLWLtir`>7)>$l8J=AMv`?>m{8e7LSG~G>ZGlN_GV42UY!-uS$%{?OeFL* zQ`^W08KzOvQ0&KuT1n%@S&U&yn_(qQfIUR}Q7UvhY2;(sq{dK%0!@(uO%s|aPVE?* z8Nkd=LU$Esc8|>rVrCDa_wYdWlDlY@(0%BfqWeh`JwWI|REG#XjOt!OkDxkA=zXZ} zC-fL;x|=>g=!57UC-fmyLxc|F^$|KkRuoWRJql|ntaSo2L-b)@o<#Y{bL!mYf;zWA zbEKJ`BB-$9bJ=NR@Vaa#X)drFM#`drJVjcffm}qUp@DQmcA<fkLI$CMv_Y1jfz&`E zpn=oJVWWW)#(|=NQ>4ADfrG-ipn=1nk06*~x|g&RP|(vDc{I!devF6uI7$V(yAo$m z-0lJ$;N9XL7w91G7B{v)pTH|8)ETXK<4MBGr+E1^FQ4I7pGEn}JLfJ`l7)gY|MIdd z{kBj`mjSiFa-a^V2O5Az;4YvESOGKxEx<~k6=(xi0qwwQpaZxY=mffeHNaY+8(0Ud z2Q~m3fdH^c$kx&xG&TcU01*fR5)cApAPn>ZTY-pByKmnmU0v}nrCJWCL>1Htv`@HP zsaCKZ-6qsw7o!9X8Wm8glW0s(Z=rF4CXlfhxK*uIzoJf8Z&O~ryeK!Tikwl-|Gw_r z=Go+>FU1Xs^S>=_Lk7_r`rInhHYzjMq%x<}`B0hpxy6gff~k#YHWt(-R84s+RVRGW zLUU)S&<ttXfscmCM!&$y?YvC#(jXph<RBh)ayB<-2C1rei`3G$^Vz@tnf&LKBn3f8 z0~z27CE0;Nqmm4uzZ2L6>;?vbJpfaddjPI1?-!NjLW#2MEh@_h%?ir_x=%m}QQ0r3 zH_!tDJqS_BE*8{j>W%76>U4UD3(#Q!4SFvt!9BtR=%_&N1NV@ibbQ$TrR-?84ED!~ z=l}CgHT>`HRAt|f$`P(R*e!|Qk4}$wKl-m$v*uDY8#Z5?_{E4G=2CV)FOTu^fpNuc sX|84lnfZlE+k;S~<A8Qa`ex}R=^;!QrELiPVSrthMgS&h3cw}pUlZ|h#{d8T delta 3249 zcmeH|-%k`*6vyX`3SvYoZJJ;zHf=2?Z7sVqv#@Pi7X=ly6<S0^7Tukl#UCKJfC!?y zBKTX}-9Z)+@kif$YP%1ljcNQBY|@uDzS)N&P43OSG>uLAy~6+t=>z@s#!Wuw+<VWt z_nga~!~Ah+*_S^rr3VSx?JEg~H8t!Cg-Sg^uS@j?G^JQ6Ho_rIRdn6!R*ZnVR1d0d zHDrXndeG23Ud62&n&CBcP4j*g@5rJ?v+VUslO+n%&vQLX%$}}l^Vh*u=_mQwl<s@w zVQc#L{%6_dv-`DamiV5`_9uSWyUp=Ye{y(bS{+(fUXWK6F<K)ny3y9w64|J?Nh_)% zEsc>tvl;nuz!y|Xk(awz)!c?B;8p^fj-*^#$Z&-{O4#Mmy&)wOR0DWT5Tn$kc}fCm zKsVf~swm#H@2BmuIrj98^u*JqoTwp7QW!LX!{7)w3XXv$&<u`)7SIY#fCyMENm3iM zINuKb$sGlf6qPz~_$25Ar$84F7j}cwpa+}*XF(Liz&X$h&VxSC4@8m|z(sHgTn1Ob zRj?Mxia6JRh;<!I#fwe_9KePMIvs4Gfo|YT93(&z%nzLB1U6w4_YVRQLnLpX6@jJ? zXJ&j8`$K?mfe0XU3VI2&I4oW({Kb0MS>J-r!g>T+1iKA{?<f$yIk1UDGLzVt`GV;L z;3M6E7y}}}U9o{tzt)WX(vW($h$^)8TP(KDve?^fy~W<LW(Jn_U`39@239oVVAf5n z$#$I0o{afe#{BWbnrP;Q?>iT|oD1E|HFKdbVQCMhI1VaP<~k^jgXTDRm^Rlza~w(> z2h;!j;Y^-+le0OyChK!EbE_P$-auc!6irfs*HHP?HGB`Td|pdaD@d4T<`&;eNs9A* zkO{t@DujjCO*t^x;s?m$^~C+Qy@9s!gTxPEG0X$BjR%SAP$A+5RG4@p)L}pIBhW{Q zAA@Ql-VAk|cnef3@e??n<Pq9#K_Im@F>4dEc39%P!<Hu@UyQKAN{ba*ypt;VDT3lP z;{`-LOQT(|CwK$xupoFh(mXA$=&=u<v5g04=j>xFeioa9?B!k9i;|ErTb{FJuYK-3 z<cn{4ANhDcxBxDKOW-oN0<MB<<g4e`F&O|iKpZ4M5}05R+yp})1%|;bFamCaQE&&0 zfxF-yxDOtHFUXgA{3f|lkIHD}_;?n*BX5bt?m0#O-D{G>q+Ui$W=r_T@^ATUnfOC& zk~ib^HG~QG$&hS0FO%7;e6!5A;CI}V`XPrri+v*Mo6Yt~Jfr^Of3M2_eUUGZbGyoi z$JY>@UO-95r)n5KvnbzkS;z`oZnb4b5r4CgBA!#j$*m~9|Gao7DY{Me@$H}zxPTid zKm|L12b6$PuoLV8pMVd+r{E*m7mM9@>c0~o&)NFl?bLsgSIK37SIdwn{m*2!f`2aa z8rgbJ?BudlERU_`d+au;l`-Lag@o@u(I)$4UWYc>lPV~peT!`}uT2&|C-@ipSDzDJ zYd78jTh`msKfe_pEua<szB#@4zkW|9sj>lWbr2MT`7L<}dzmd6z&Z#%06GwD83J}& F{t3-O-8ujO diff --git a/Indexer/Indexer/Application.cs b/Indexer/Indexer/Application.cs new file mode 100644 index 0000000..c2db838 --- /dev/null +++ b/Indexer/Indexer/Application.cs @@ -0,0 +1,70 @@ +using Indexer.Indexer.Services; +using Indexer.Indexer.Utils; +using Indexer.Models; +using Microsoft.Extensions.DependencyInjection; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Threading.Tasks; + +namespace Indexer.Indexer +{ + public class Application + { + + private readonly IIndexerService _indexer; + private readonly IDictionaryService _dictionary; + + public Application(IIndexerService indexer, IDictionaryService dictionary) => (_indexer, _dictionary) = (indexer, dictionary); + + public void Run(string[] args) + { + foreach (string path in args) + { + if (File.Exists(path)) + { + // This path is a file + ProcessFile(path); + } + else if (Directory.Exists(path)) + { + // This path is a directory + ProcessDirectory(path); + } + else + { + Console.WriteLine("{0} is not a valid file or directory.", path); + } + } + } + + public static void ProcessDirectory(string targetDirectory) + { + // Process the list of files found in the directory. + string[] fileEntries = Directory.GetFiles(targetDirectory); + foreach (string fileName in fileEntries) + ProcessFile(fileName); + + // Recurse into subdirectories of this directory. + string[] subdirectoryEntries = Directory.GetDirectories(targetDirectory); + foreach (string subdirectory in subdirectoryEntries) + ProcessDirectory(subdirectory); + } + + // Insert logic for processing found files here. + public static void ProcessFile(string path) + { + string contents = File.ReadAllText(path); + BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents); + + Preprocessor processor = new(); + HashSet<string> tokens = processor.Process(entry); + + Console.WriteLine("Processed file '{0}'. Number of tokens {1}", path, tokens.Count); + } + } +} + diff --git a/Indexer/Indexer/Indexer.Indexer.csproj b/Indexer/Indexer/Indexer.Indexer.csproj index 4b89db9..5d248e9 100644 --- a/Indexer/Indexer/Indexer.Indexer.csproj +++ b/Indexer/Indexer/Indexer.Indexer.csproj @@ -11,6 +11,7 @@ <ItemGroup> <PackageReference Include="HtmlAgilityPack" Version="1.11.33" /> + <PackageReference Include="Microsoft.Extensions.DependencyInjection" Version="5.0.1" /> <PackageReference Include="Microsoft.ML" Version="1.5.5" /> <PackageReference Include="StemmersNet.Standard" Version="1.1.1" /> </ItemGroup> diff --git a/Indexer/Indexer/Program.cs b/Indexer/Indexer/Program.cs index 4e11e51..f8aa3ce 100644 --- a/Indexer/Indexer/Program.cs +++ b/Indexer/Indexer/Program.cs @@ -1,5 +1,8 @@  +using Indexer.Indexer.Services; +using Indexer.Indexer.Utils; using Indexer.Models; +using Microsoft.Extensions.DependencyInjection; using System; using System.Collections.Generic; using System.IO; @@ -10,56 +13,18 @@ namespace Indexer.Indexer class Program { - static void Main(string[] args) { - new Program().Run(args); - } - - public void Run(string[] args) - { - foreach (string path in args) - { - if (File.Exists(path)) - { - // This path is a file - ProcessFile(path); - } - else if (Directory.Exists(path)) - { - // This path is a directory - ProcessDirectory(path); - } - else - { - Console.WriteLine("{0} is not a valid file or directory.", path); - } - } - } + IServiceCollection serviceCollection = new ServiceCollection(); + serviceCollection.AddScoped<Application>(); + serviceCollection.AddSingleton<IIndexerService, IndexerService>(); + serviceCollection.AddSingleton<IDictionaryService, DictionaryService>(); - public static void ProcessDirectory(string targetDirectory) - { - // Process the list of files found in the directory. - string[] fileEntries = Directory.GetFiles(targetDirectory); - foreach (string fileName in fileEntries) - ProcessFile(fileName); + var serviceProvider = serviceCollection.BuildServiceProvider(); - // Recurse into subdirectories of this directory. - string[] subdirectoryEntries = Directory.GetDirectories(targetDirectory); - foreach (string subdirectory in subdirectoryEntries) - ProcessDirectory(subdirectory); + var app = serviceProvider.GetService<Application>(); + app.Run(args); } - // Insert logic for processing found files here. - public static void ProcessFile(string path) - { - string contents = File.ReadAllText(path); - BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents); - - Preprocessor processor = new(); - HashSet<string> tokens = processor.Process(entry); - - Console.WriteLine("Processed file '{0}'. Number of tokens {1}", path, tokens.Count); - } - } + } } diff --git a/Indexer/Indexer/Services/DictionaryService.cs b/Indexer/Indexer/Services/DictionaryService.cs new file mode 100644 index 0000000..f4f3b16 --- /dev/null +++ b/Indexer/Indexer/Services/DictionaryService.cs @@ -0,0 +1,12 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer.Indexer.Services +{ + public class DictionaryService : IDictionaryService + { + } +} diff --git a/Indexer/Indexer/Services/IDictionaryService.cs b/Indexer/Indexer/Services/IDictionaryService.cs new file mode 100644 index 0000000..c0123fb --- /dev/null +++ b/Indexer/Indexer/Services/IDictionaryService.cs @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer.Indexer.Services +{ + public interface IDictionaryService + { + + } +} diff --git a/Indexer/Indexer/Services/IIndexerService.cs b/Indexer/Indexer/Services/IIndexerService.cs new file mode 100644 index 0000000..292a712 --- /dev/null +++ b/Indexer/Indexer/Services/IIndexerService.cs @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer.Indexer.Services +{ + public interface IIndexerService + { + + } +} diff --git a/Indexer/Indexer/Services/IndexerService.cs b/Indexer/Indexer/Services/IndexerService.cs new file mode 100644 index 0000000..1bb88f9 --- /dev/null +++ b/Indexer/Indexer/Services/IndexerService.cs @@ -0,0 +1,14 @@ +using Indexer.Indexer.Services; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer.Indexer.Utils +{ + public class IndexerService : IIndexerService + { + + } +} diff --git a/Indexer/Models/Indexer.Models.csproj b/Indexer/Models/Indexer.Models.csproj index 6e9ceba..67d4c19 100644 --- a/Indexer/Models/Indexer.Models.csproj +++ b/Indexer/Models/Indexer.Models.csproj @@ -4,4 +4,8 @@ <TargetFramework>net5.0</TargetFramework> </PropertyGroup> + <ItemGroup> + <PackageReference Include="System.Collections.NonGeneric" Version="4.3.0" /> + </ItemGroup> + </Project> diff --git a/Indexer/Models/Models/Dictionary.cs b/Indexer/Models/Models/Dictionary.cs new file mode 100644 index 0000000..0fe3276 --- /dev/null +++ b/Indexer/Models/Models/Dictionary.cs @@ -0,0 +1,18 @@ +using System; +using System.Collections; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer.Models.Models +{ + public class Dictionary + { + private Hashtable content; + + public Dictionary() { content = new Hashtable(); } + + public Hashtable Content { get; } + + } +} diff --git a/Indexer/Models/Models/Index.cs b/Indexer/Models/Models/Index.cs new file mode 100644 index 0000000..f429c9f --- /dev/null +++ b/Indexer/Models/Models/Index.cs @@ -0,0 +1,13 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Indexer.Models.Models +{ + public class Index + { + + } +} -- GitLab From fa4af9abb7c6c93ac2634970fcf4f1c4a3457dc7 Mon Sep 17 00:00:00 2001 From: Radek Mrvec <radek.mrvec@sumanet.cz> Date: Mon, 26 Apr 2021 22:47:33 +0200 Subject: [PATCH 5/6] refs #8332 Dictionary created during preprocess @1 --- .../.vs/Indexer/DesignTimeBuild/.dtbcache.v2 | Bin 83069 -> 84517 bytes Indexer/Indexer/Application.cs | 57 +++++++---------- Indexer/Indexer/Services/DictionaryService.cs | 14 +++++ .../Indexer/Services/IDictionaryService.cs | 2 + Indexer/Indexer/Utils/FileProcessor.cs | 58 ++++++++++++++++++ Indexer/Indexer/Utils/Preprocessor.cs | 14 ++++- 6 files changed, 108 insertions(+), 37 deletions(-) create mode 100644 Indexer/Indexer/Utils/FileProcessor.cs diff --git a/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 b/Indexer/.vs/Indexer/DesignTimeBuild/.dtbcache.v2 index fc9a77633dc400b98032d23164eb9536cd66fe9a..a73f8f09a5c27f79e97a662a60f47790b4b5a34c 100644 GIT binary patch delta 5551 zcmeHJdvH|M9o}=3Km@`oP)Ic4ZPE^G?%ns|P!ryfXCy=(8{qEVyBG+&VRxY*=mrFZ z@GzSb2q5y%Qmr$cvF_N|N;S1FTZ$E01yLm6gF)!@p6)-jwKM(B-MgC&qY@eIU%fNm z`F-d7zVo=}p5O1>|Ln*)(V9a(iII8bUbiAkK1Fn^R@LE@txlIBxg4tOcl&Ij52b8% z`z5F9@>�wA*}&Q}rpbY;(BW7Kg<eYc3>fX?f>zns+Ime)d60cGi@M9y;RE7`mMF zrjsunq@R%v;-znO&ZhG&=2Ee9Ku&ffv`(ep?~LVSo0a-{T5)L-?R&X4J~8{0KvvR< zk(E@7700jVIkM>HjaBq-t^MO?2KOfMZ9|@Ki2rN*@@yKrJdW<UToXUll%GXEI$jgk zVrL1BoT-KQA=-Ypw&g&-xNZOD3NoJl^jtBYmJYgNr+MdRlUi!N_+8p?#U6j@{1HM= zTq()zGd~nu8Isq~H?9=X^%pAW$IxE8FucOzblL1apUvy{OJ3FIwfZfhtoXc=;&i)I zuS4<5Zb@=ECCTm;T|U)nQ5}AV*NN>BeOA%58hazsk=eQQC>f-0KrkzpmXdrrjEtq_ zWCW?qP%NWUIwwF~U?}?$Plh+u-;(ZANdJ*FQ#e6d`jtrPhKL#n*98M%v!XVr0Ywe? zHr54JslEsr&9XNf3CTJSd+cV>EYdA~i&sqR>NPdpt1{iIBBK}oRHhAc>U^PKIOva< zd(@S740Y$rir4LQTU{2nqR6U4G(E~!9R2DQ>x+~;^>toPK#gF<o*qxl7rOeE>fy%L zj2~H}hWN%!(znGd(W)V%2j0A9x;FRzcunsKz3ACN#qoOwSqZJ`TTGkx-9x1z3%urA zPG`(k#wU?XIZ0F6({83&*s-xa*)cj{`)L|&8cjDf%@<xERE$|}?`&9l^nsHnN<SNW z^~o0XWRbV?S?N#q%JGSNS7g!G>6^$X$99+M+yBD3TCzkBX&zd;&VMi3zoMa`zRoA> z2j3Sq6)jLh>+5`K*fX_GU#}e6Xf&DOE>62EztDeYm%kEWwLuMKbS~_2rhKFH$@s`a z7e?yp!=Cve75N48hl3&BEYilNQS{vl11rj{HmhQD$_~-#mHl?fW%o;p=(PG&(I&ZU zqUcn;Hm~S*`0a`)JFKcA*=;UWl_i@?bjzY6UjAM_p=?Ng!Q5bAh8j>qb-ps2*=iO| zIZ0E*z5TkprUgi_!)b$%?VP)Ah8meJ*VlVx-|FeXdPNO+!W+X8b&c5<tglaIj<7ip z46IC!?9qpeSG^w~V}{HLDk@&CYs>hn4KP~ujiRqy+!-(Ya8eejMjAt)TrChxbJH2E z`4gp2o+<hCH_;ncEkj=tOs1a!1JT+J`(xl``uD44(UhkEo}Z)j7r-mPN#K{j%fPF^ zYd{C^E8unD*T8Rp-vYk_eh<6>`~i3o_#<!%_!Dp%coX<DP-M*W7C7dLn$Cb^PIwk| zK51&Qo=)@^^!Y3BHt-H`^TY#;iO<2u$3G8r2+`!~Z|<DA;&&mu01ynoC*bxy*eqE7 z=KO8Bp4Z<u>K9>qq5c6ZpX?IAwVeRh-T@z9$ayyk{-)fSxro#P&}_Pl;tIeg_>ec? zQHp5ewPkJNOUN|)xG4#3L?WTxt&K})V-x8IRsGRQ3_+<@k`~Y<pBDBu#B_58Ka#;a zJ}r!94p?RkC>aA<G$}nGccH33n$-~4G;4Q()ezVXfkU%*7uXGf(-6?w>(8gxIp9Xu znxz|_yv(&#wpW#s7Y9XKNQ_mJsmWp3LZS_3H6&R>30ImY*aM`6G_yr0V{9>*#yxCF zQlJS5wv;58hp<}Rzl=<0%L!Y7$^j;m8O%$V4@MzOh4B-%66V2L!s=jG5w;qpp0G7A z0m6bX4TP;l_ZAi+GZQGVv@mC3&LZ$Mvvqp89_8nUwBpKyR-9lzAeC$bfnn9{1%zzQ zj&6iM#+H%$5-8Y1Smq`^#Adzo!@6@Rnbq|&3HAsYTebYzXp0hFHtOXTz1*tz!_Ncx z{3P2(%Gq|{G2n4v2e1=p0(JqrNx6sZ0oe;Y0W<?Kpaq~nE3gmP53~UXfP=sx;7Q;x z@Dy+aI0_sCjss5v&ye!Ajxj`PJ6S>cwYT>oRw3Hh_>pnw-x+B#X{N0j(F84vl?(qI ztmW|StPm;yHd#Qa<+3S))`v|M*fe}XG~{XhwE@~dHeHyEJ(?kau$jDsv{KN%!|oH< zEJ)E%pcQI&Y6G>q@F}z+Y{#=&F}-}PBFbh9$@9qO2;d2Alzxgh<!2^oK<1`o^91m; z43x`~24sFpRwaO^WuRP^G$8k<WYGlzDrp@k*Ch=|bxOBT08h(6xh!cwYErTX1n{&B zl*^I^WKl}CSOC{$Q7!=QGN>;}>6QxM>ES@RENMVIDOs%mo|b`fS<-+kOUafC;At5s z9zSo=AGRW;lLhd!4ipb>(I4#_&xzgg;DqEX%$r(_PXJFZ1{4o((I2LyWU2t3mVt6v z(ja-387WWH%DU%7o_6^s&~BA?nLjntN&!4Q6DXf4X)w-hq-TskXI_8pe~^k>zakYs zC?u0~U2<!Al3oR@7FfOES<~f-2H*+`tif=t?Q(_S2@78@vw!Wx9{sY+{@fR2cA||& zwja%NNu{1pkS#!vGLR2IkQ5N`Ac%T|Gzj7vK?{P&MR0;3LJ@Z$2tGvCFfhayVh02P zgy;ZSA+(JoWNaypCu%sOIDjDhT$W)4IC>I{#7_&F{X%;cAya#B<+S3u>~`g>5_r}4 zQVAB&tH$TS?Wb3bPh#9pNY4Cfis<D!y<D%CTgfzjIqEmm>7LwBUvfdU?Yf&3`W_a_ z*&{#{Xau$ZUvzP8Mf;;b>H^yaep{E=4HDgszK;PSAORM@3fO?h0SDj&T)+<C9$+WX z1ndHK1ABl;z+T{9p}ev2W8+@@_g$Iy;uFTb7-P-)jaN-r^Du$M1k`cmweTyC3ak}h zzPat~JBZ`|Kae~-YF$5&&E0+s_ZjJ9e^>fw!`>bEmeR-QTmBdxXfGsWQx8Hi#@h5K zGgh98vTFUO@1P!8hxGDEBg)c2_G%A;Y#~l0e%VsLf#1sS-wm?(Fi!hZz!Bi)$T^C( e%*Z*0`f=cC;2GeDzzN_-z_S34p638Pdj1P`x@qSC delta 4884 zcmeH}dvH|M9mn@~cS8b%Jczc1k(kmT#%kC{_GPuRAwsemCCEGE!S3DN5C};~Kv0k+ zKp;M%n;dWv<)y`H?X*L8+GvZS)!LHg(fWsFtfo_R)bRoLc>n1icIfxqdv}xUbez%7 zR6F&~e17Nr&iS5m@1Aq=%{g^2<DWel^yfZW;)_HAPOryib6Er4kf+QO@z}kdfYTlH z$R4-H?X<hR!7|xt_1epvPIs9c@B|#5aL{FUcrh|8_r+$@Ms|6xl^q?K#ZGi(v6{EC z*vPPZin-#(I@28X;_#fQri{`Joo%fqzagB>+D58bu5^a#jFv2>jLgflZ0Xn<HYMAV z(`B>H(Q7g-rJ<IV)C@wO&ss;^qD;t^At_P*`mQO|#HioTin<FE@l2OFarXRDq6W0p zuq6j_64y`9B-Z;zAv=EI2dw$cWeG?2PukgrzYZnd-q~oOV(}5I?yc&?Kv#j8y);&r zs2W>mW*6AXL{azaL}B*w*tP8K^FL&{7n<0o`~N~gtp(H3Hg>vaJ&R@MBz!%s<x<H4 z_V;sFvc}OXSjIbcmOofcwX9*VL|dNbcdkpE89YXX(`q`}nmPiTO65+|l`FP1w{+Ik zbcEYG+T?I&XB#^?w3yl7wX>^+zf<lFx@@xBZuiPTyT=iBJ8a>g-R-r7yk$X;Cm>s$ z5ob6M3Od3rIUI-t%3N|pwmRGaN5JZ}MohI>X*U~t_f6*epeV8Zy=r0~z29Bo3tR1; zGP~@JcxBm+DX{NBw;YJrJPz3zj#xbrN6_tdIAy0CaoX`7t1}b~+3e0pnbqa>I!&9k zExbrqVfPDIJ<VX3ho`VPG@puCA=peuuuS5FT%_qN(|m0z(?=XKTUiI)#4;`QG#@)K zixru3S-9&?>3V$)qk{$P{5dbHA8cf`L-Seh&@6Ho!s#q~I4{xu!Odp6BXyK+`Zy$+ zmTT$QN5#<@CjH4laZoau4uMBN92^FZf+OHD@HjXMj)CLg1UL!uF!l+QV*Hcf3(Rp% zYLZ!)<VP5M3OogV44wvJ!e<!&c!6KM`j_BWU;sP^o(I1MbBxiyL0!yn227mg1>|C; zXTeVyeRB2G)NHK$lY)};`Erf<ert?-#Ta{9sG}3#{3?2W2M8n18s%#!&B#xq6z>sw zk^kPv|A2A|@^dJ~R}BK8{UZ?Cspu1LPQ4ZN7lD}nCCT)?)K5ol8bTNb;sqn30nf}L z_Qu5qIu#$trV6(GQcl))Rq-hESns9T$z(qJ*?3aIM#q!yGi5xvf#oZyF$>~9&!tpm zi?@}IXO4JmLgtL0E~T@{ca_I4=Zz~X;zjvXm+bQ={>Gonk5;J4)vC&u6JJKEQmlQj z7DTRBR*)fDNqj|`2)Qm=l@zTezA8<GTo<iLifW0k(L_<b2T?TWs<lZ~9r3kkQz6$y z^+{0!@%l6oa$VGz6s;rPm?lDw2k+;Bq$)_LPhJG%`c!gK6e2FCiID4}a8eW@9!?V> z*F{Z9(R$)d(KJ<4Qq`PPZ6My9CPJ=HwUPR%kGIe*M!t!ZnY`8LMH{;G9uavP)G^+U z$oOVjq_ssmbcG_RypvSEh4@yZ>#p&hZRpuf{BEP?p7EX?=-Em9UhPA6(PADYz8lG5 z-bIys5AnSy_7U$!v7h(>6g|Z6LvcUx2dHv2e~@@D(uasYj3P$755xO;KUJv+_&rS2 zn5Z2@PmCYZ;v<NkJ*5=-Ri#knaq{!SgaW@<nTxi$@zG)`9yn&9rIzS2@~h%F<Mg6} zBa0J?3XUZX9V$3^IB2Ngbm1_ef)j)Tg9=Uw&IBqr2-tU2u-VvKRIs7ENmQ_B*dkQ0 zFZ@w_S0CR+x2g#I2wEPCh;M&f`|6{J3=4f5w!>I?m2cBhW4TqnLraY{R{1duIj9s= zW5jV1@q`vnYViqe)+Z4^du`yvbyOn7)#+3j<vz*B%fUiW0d4_{z+zAds(>Hd3T^|{ z;C65aSORLmQgA0&29|>rU?o@uR)aO57OVwzpdK`UMybWe*P#*sK_G(=2!jY{0_#Du z<lDV_H0hrBZ=#s?D5OY=nQxH(F;mG9OSw^6h~;Y`RCtSosATa?lJYIyD)Ba)+KutX z8^{%3oCo7QwSX25WLfFiZ+s00<$v4PViWu-_Js}R_LRf?M+S4)Fq7vkG%tG*d}WcU zRHCRHxNITC{8c;4jgy@vrke%+ScFfsOCoO8VuuzdxW$<`+`>k`h*K4PJL47d@mB|G zrqPG=-?NTRIKURL6^vU)RJV@1&^XaLwjtjR?gsaO9bhLAR&p=UtmIFIm7JYqCA$nO z>E}_Y9Pr%|B3z_PQl{}e65k6KLBk9sN6A%+^Y}i^ak?c`_<j+gJs=#XN8<NEyH8?s zFRl8<ag3{l_#cVq|MMo*_jxy|FMTWeGM-f|?nc_J=*Z{XioT{s=AUe2ed$*u?GnVh zH5a>Iiw|h=!GG&(H^(hibj4TWYP~R{hrr|ukwbU2htV<7)ndr|fVdd-1L0;2Xm0i| DCb6f> diff --git a/Indexer/Indexer/Application.cs b/Indexer/Indexer/Application.cs index c2db838..1e80d09 100644 --- a/Indexer/Indexer/Application.cs +++ b/Indexer/Indexer/Application.cs @@ -18,52 +18,39 @@ namespace Indexer.Indexer private readonly IIndexerService _indexer; private readonly IDictionaryService _dictionary; + private List<string> files = new(); + public Application(IIndexerService indexer, IDictionaryService dictionary) => (_indexer, _dictionary) = (indexer, dictionary); public void Run(string[] args) { - foreach (string path in args) - { - if (File.Exists(path)) - { - // This path is a file - ProcessFile(path); - } - else if (Directory.Exists(path)) - { - // This path is a directory - ProcessDirectory(path); - } - else - { - Console.WriteLine("{0} is not a valid file or directory.", path); - } - } - } + FileProcessor fileProcessor = new(); + List<string> files = fileProcessor.GetFiles(args); - public static void ProcessDirectory(string targetDirectory) - { - // Process the list of files found in the directory. - string[] fileEntries = Directory.GetFiles(targetDirectory); - foreach (string fileName in fileEntries) - ProcessFile(fileName); + Console.WriteLine($"Going to preprocess {files.Count} files."); - // Recurse into subdirectories of this directory. - string[] subdirectoryEntries = Directory.GetDirectories(targetDirectory); - foreach (string subdirectory in subdirectoryEntries) - ProcessDirectory(subdirectory); + PreprocessFiles(files); } - // Insert logic for processing found files here. - public static void ProcessFile(string path) + private void PreprocessFiles(List<string> files) { - string contents = File.ReadAllText(path); - BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(contents); + int tokensCount = 0; + foreach (var item in files) + { + string content = File.ReadAllText(item); - Preprocessor processor = new(); - HashSet<string> tokens = processor.Process(entry); + HashSet<string> tokens = GetTokens(content); + tokensCount += tokens.Count; + } + Console.WriteLine($"All files has been preprocessed and {tokensCount} tokens created."); + Console.WriteLine($"Dictionary with {_dictionary.GetLastId()} words created."); + } - Console.WriteLine("Processed file '{0}'. Number of tokens {1}", path, tokens.Count); + private HashSet<string> GetTokens(string content) + { + BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(content); + Preprocessor processor = new(_dictionary); + return processor.Process(entry); } } } diff --git a/Indexer/Indexer/Services/DictionaryService.cs b/Indexer/Indexer/Services/DictionaryService.cs index f4f3b16..17ff9cc 100644 --- a/Indexer/Indexer/Services/DictionaryService.cs +++ b/Indexer/Indexer/Services/DictionaryService.cs @@ -8,5 +8,19 @@ namespace Indexer.Indexer.Services { public class DictionaryService : IDictionaryService { + private Dictionary<string, int> idxDictionary = new(); + + public void AddToDictionary(string key, int value) + { + if (!idxDictionary.ContainsKey(key)) + { + idxDictionary.Add(key, value); + } + } + + public int GetLastId() + { + return idxDictionary.Count; + } } } diff --git a/Indexer/Indexer/Services/IDictionaryService.cs b/Indexer/Indexer/Services/IDictionaryService.cs index c0123fb..ddd4d0c 100644 --- a/Indexer/Indexer/Services/IDictionaryService.cs +++ b/Indexer/Indexer/Services/IDictionaryService.cs @@ -8,6 +8,8 @@ namespace Indexer.Indexer.Services { public interface IDictionaryService { + public void AddToDictionary(string key, int value); + public int GetLastId(); } } diff --git a/Indexer/Indexer/Utils/FileProcessor.cs b/Indexer/Indexer/Utils/FileProcessor.cs new file mode 100644 index 0000000..50d6994 --- /dev/null +++ b/Indexer/Indexer/Utils/FileProcessor.cs @@ -0,0 +1,58 @@ +using Indexer.Models; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Threading.Tasks; + +namespace Indexer.Indexer.Utils +{ + public class FileProcessor + { + + private readonly List<string> files = new(); + + public List<string> GetFiles(string[] args) + { + foreach (string path in args) + { + if (File.Exists(path)) + { + // This path is a file + CollectFile(path); + } + else if (Directory.Exists(path)) + { + // This path is a directory + ProcessDirectory(path); + } + else + { + Console.WriteLine("{0} is not a valid file or directory.", path); + } + } + return files; + } + + + private void ProcessDirectory(string targetDirectory) + { + // Process the list of files found in the directory. + string[] fileEntries = Directory.GetFiles(targetDirectory); + foreach (string fileName in fileEntries) + CollectFile(fileName); + + // Recurse into subdirectories of this directory. + string[] subdirectoryEntries = Directory.GetDirectories(targetDirectory); + foreach (string subdirectory in subdirectoryEntries) + ProcessDirectory(subdirectory); + } + + private void CollectFile(string path) + { + files.Add(path); + } + } +} diff --git a/Indexer/Indexer/Utils/Preprocessor.cs b/Indexer/Indexer/Utils/Preprocessor.cs index 3ccff14..7ea737c 100644 --- a/Indexer/Indexer/Utils/Preprocessor.cs +++ b/Indexer/Indexer/Utils/Preprocessor.cs @@ -1,4 +1,5 @@ -using Indexer.Models; +using Indexer.Indexer.Services; +using Indexer.Models; using Indexer.Models.Enums; using Indexer.Models.Models; using Iveonik.Stemmers; @@ -16,6 +17,10 @@ namespace Indexer.Indexer public class Preprocessor { + private readonly IDictionaryService _dictionary; + + public Preprocessor(IDictionaryService dictionary) => (_dictionary) = (dictionary); + public HashSet<string> Process(BlogEntry entry) { string content = RemoveTags(entry.Content); @@ -80,9 +85,14 @@ namespace Indexer.Indexer private HashSet<string> StemmTokens(IStemmer stemmer, params string[] words) { HashSet<string> stems = new HashSet<string>(); + int id = 0; foreach (string word in words) { - stems.Add(stemmer.Stem(word)); + string stem = stemmer.Stem(word).Trim(); + stems.Add(stem); + id = _dictionary.GetLastId(); + id++; + _dictionary.AddToDictionary(stem, id); } return stems; } -- GitLab From 9a127a72235d8e25ad54a6c04defbbcfb4d9076f Mon Sep 17 00:00:00 2001 From: Radek Mrvec <radek.mrvec@sumanet.cz> Date: Sun, 2 May 2021 19:16:01 +0200 Subject: [PATCH 6/6] refs #8332 Indexing engine --- Indexer/Indexer/Application.cs | 18 ++++++++--- Indexer/Indexer/Services/DictionaryService.cs | 5 ++++ .../Indexer/Services/IDictionaryService.cs | 2 ++ Indexer/Indexer/Services/IIndexerService.cs | 2 ++ Indexer/Indexer/Services/IndexerService.cs | 30 +++++++++++++++++++ 5 files changed, 53 insertions(+), 4 deletions(-) diff --git a/Indexer/Indexer/Application.cs b/Indexer/Indexer/Application.cs index 1e80d09..fa29a9d 100644 --- a/Indexer/Indexer/Application.cs +++ b/Indexer/Indexer/Application.cs @@ -27,25 +27,35 @@ namespace Indexer.Indexer FileProcessor fileProcessor = new(); List<string> files = fileProcessor.GetFiles(args); - Console.WriteLine($"Going to preprocess {files.Count} files."); + Console.WriteLine($"Going to process {files.Count} files."); - PreprocessFiles(files); + ProcessFiles(files); } - private void PreprocessFiles(List<string> files) + private void ProcessFiles(List<string> files) { int tokensCount = 0; + Console.WriteLine($"Starting processing and indexing files."); foreach (var item in files) { string content = File.ReadAllText(item); HashSet<string> tokens = GetTokens(content); - tokensCount += tokens.Count; + IndexFile(tokens, files.IndexOf(item)); + tokensCount += tokens.Count; } Console.WriteLine($"All files has been preprocessed and {tokensCount} tokens created."); Console.WriteLine($"Dictionary with {_dictionary.GetLastId()} words created."); } + private void IndexFile(HashSet<string> tokens, int file) + { + foreach (var item in tokens) + { + _indexer.AddToIndex(_dictionary.GetWordId(item), file); + } + } + private HashSet<string> GetTokens(string content) { BlogEntry entry = JsonSerializer.Deserialize<BlogEntry>(content); diff --git a/Indexer/Indexer/Services/DictionaryService.cs b/Indexer/Indexer/Services/DictionaryService.cs index 17ff9cc..43f392e 100644 --- a/Indexer/Indexer/Services/DictionaryService.cs +++ b/Indexer/Indexer/Services/DictionaryService.cs @@ -18,6 +18,11 @@ namespace Indexer.Indexer.Services } } + public int GetWordId(string word) + { + return idxDictionary.GetValueOrDefault(word); + } + public int GetLastId() { return idxDictionary.Count; diff --git a/Indexer/Indexer/Services/IDictionaryService.cs b/Indexer/Indexer/Services/IDictionaryService.cs index ddd4d0c..8d38c79 100644 --- a/Indexer/Indexer/Services/IDictionaryService.cs +++ b/Indexer/Indexer/Services/IDictionaryService.cs @@ -10,6 +10,8 @@ namespace Indexer.Indexer.Services { public void AddToDictionary(string key, int value); + public int GetWordId(string word); + public int GetLastId(); } } diff --git a/Indexer/Indexer/Services/IIndexerService.cs b/Indexer/Indexer/Services/IIndexerService.cs index 292a712..d80329e 100644 --- a/Indexer/Indexer/Services/IIndexerService.cs +++ b/Indexer/Indexer/Services/IIndexerService.cs @@ -8,6 +8,8 @@ namespace Indexer.Indexer.Services { public interface IIndexerService { + public void AddToIndex(int word, int file); + public void ExportIndex(); } } diff --git a/Indexer/Indexer/Services/IndexerService.cs b/Indexer/Indexer/Services/IndexerService.cs index 1bb88f9..e29e706 100644 --- a/Indexer/Indexer/Services/IndexerService.cs +++ b/Indexer/Indexer/Services/IndexerService.cs @@ -9,6 +9,36 @@ namespace Indexer.Indexer.Utils { public class IndexerService : IIndexerService { + private Dictionary<int, LinkedList<int>> idx = new(); + + public void AddToIndex(int word, int file) + { + if(idx.ContainsKey(word)) + { + var item = idx.GetValueOrDefault(word); + item.AddLast(file); + } + else + { + LinkedList<int> postList = new LinkedList<int>(); + postList.AddLast(file); + idx.Add(word, new LinkedList<int>(postList)); + } + } + + + public void ExportIndex() + { + foreach(var item in idx) + { + Console.Write($"{item.Key} --> "); + foreach(var file in item.Value) + { + Console.Write($"{file} "); + } + Console.Write("\n"); + } + } } } -- GitLab