using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Xml;

namespace ConsoleApp1
{
    /// <summary>
    /// Console tool that scans a directory tree for <c>evkjkv.html</c> files and prints one
    /// comma-separated line per file: the names of the two directories directly above the file,
    /// followed by a <c>party,value</c> pair for every entry in <see cref="Parties"/>.
    /// </summary>
    class Program
    {
        /// <summary>Root directory that is searched recursively for result pages.</summary>
        private const string RootDirectory = @"C:\temp\valasztas\valasztas.hu\dyn\pv18\szavossz\hu\";

        /// <summary>Exact file name of the pages to process.</summary>
        private const string ResultFileName = "evkjkv.html";

        /// <summary>
        /// Captures the two directory names directly above the result file (groups 1 and 2).
        /// The path separator and the dot before <c>html</c> are matched literally instead of
        /// with the "any character" wildcard.
        /// </summary>
        private static readonly Regex PathPattern = new Regex(
            @".+\\(\w+)\\(\w+)\\evkjkv\.html",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>Party name prefixes looked up in every page, in output column order.</summary>
        private static readonly string[] Parties =
        {
            "FIDESZ",
            "JOBBIK",
            "MSZP",
            "LMP",
            "EGYÜTT",
            "DK",
        };

        static void Main()
        {
            new Program().Run();
        }

        /// <summary>
        /// Processes every result page under <see cref="RootDirectory"/> and writes one CSV line
        /// per page to standard output.
        /// </summary>
        private void Run()
        {
            foreach (string file in Directory.GetFiles(RootDirectory, ResultFileName, SearchOption.AllDirectories))
            {
                XmlDocument document = FromHtml(file);

                IEnumerable<string> columns = Parties.Select(
                    party => $"{party},{GetVotes(document, file, party)}");
                string result = string.Join(",", columns);

                // When the path does not match, both groups are empty and the line starts with ",,".
                Match match = PathPattern.Match(file);
                Console.WriteLine($"{match.Groups[1]},{match.Groups[2]},{result}");
            }
        }

        /// <summary>
        /// Returns the text of the fourth cell of the first table row that has a cell whose text
        /// starts with <paramref name="party"/>, with <c>&amp;</c> and space characters removed.
        /// </summary>
        /// <param name="d">Parsed page to search.</param>
        /// <param name="file">Path of the page; currently unused, kept for diagnostics.</param>
        /// <param name="party">Prefix of the party name to look for.</param>
        /// <returns>
        /// The cleaned cell text, or <c>"0"</c> when there is no matching row or the row has
        /// fewer than four cells.
        /// </returns>
        private static string GetVotes(XmlDocument d, string file, string party)
        {
            XmlNode row = d.SelectSingleNode($"//tr[td[starts-with(text(), '{party}')]]");

            // A missing row and a row without a fourth cell are both reported as zero instead of
            // failing with a NullReferenceException.
            string cellText = row?.SelectSingleNode("td[4]")?.InnerText;
            if (cellText == null)
            {
                return "0";
            }

            return cellText.Replace("&", "").Replace(" ", "");
        }

        /// <summary>
        /// Loads an HTML file into an <see cref="XmlDocument"/> by reading it through
        /// <c>Sgml.SgmlReader</c> with element names folded to lower case.
        /// </summary>
        /// <param name="path">Path of the HTML file to read.</param>
        /// <returns>The loaded document, with whitespace preserved and no XML resolver.</returns>
        private static XmlDocument FromHtml(string path)
        {
            using (TextReader reader = File.OpenText(path))
            using (var sgmlReader = new Sgml.SgmlReader
            {
                DocType = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding = Sgml.CaseFolding.ToLower,
                InputStream = reader
            })
            {
                var doc = new XmlDocument { PreserveWhitespace = true, XmlResolver = null };
                doc.Load(sgmlReader);
                return doc;
            }
        }
    }
}
Could you hire me? Contact me if you like what I’ve done in this article and think I can create value for your company with my skills.