First, you will need an extension method to split the rows into tiers:
/// <summary>
/// Splits a sequence into consecutive batches, starting a new batch at every
/// element for which <paramref name="separator"/> returns <c>true</c>.
/// </summary>
/// <remarks>
/// The separating element is kept as the first element of the batch it opens.
/// Elements that precede the first separating element form a leading batch of
/// their own. Empty batches are never produced. Arguments are validated
/// immediately; the batches themselves are built lazily while the result is
/// enumerated.
/// </remarks>
/// <typeparam name="T">Type of the elements in the sequence.</typeparam>
/// <param name="source">Sequence to split.</param>
/// <param name="separator">
/// Predicate that marks the first element of a new batch. It is invoked once
/// for every element of <paramref name="source"/>.
/// </param>
/// <returns>The batches, in source order.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="source"/> or <paramref name="separator"/> is <c>null</c>.
/// </exception>
public static IEnumerable<IEnumerable<T>> SplitBy<T>(
    this IEnumerable<T> source, Func<T, bool> separator)
{
    if (source == null)
    {
        throw new ArgumentNullException("source");
    }

    if (separator == null)
    {
        throw new ArgumentNullException("separator");
    }

    return SplitByIterator(source, separator);
}

// Lazily builds the batches for SplitBy. Kept separate so that SplitBy itself
// is not an iterator method and can therefore validate its arguments at call
// time instead of on first enumeration.
private static IEnumerable<IEnumerable<T>> SplitByIterator<T>(
    IEnumerable<T> source, Func<T, bool> separator)
{
    List<T> batch = new List<T>();

    foreach (T item in source)
    {
        // The predicate is evaluated first so it runs for every element,
        // including the very first one (when the batch is still empty).
        if (separator(item) && batch.Count > 0)
        {
            yield return batch;

            // Start a fresh list; the one just yielded now belongs to the caller.
            batch = new List<T>();
        }

        batch.Add(item);
    }

    // Flush the trailing batch (absent only when the source was empty).
    if (batch.Count > 0)
    {
        yield return batch;
    }
}
Now the first step is to query the tiers (each one will contain several tr
nodes):
// Load the HTML file and group its table rows into tiers.
HtmlDocument doc = new HtmlDocument();
doc.Load(path_to_html);

// SelectNodes returns null (not an empty collection) when nothing matches,
// so fall back to an empty sequence instead of failing on a document
// without any tr elements.
IEnumerable<HtmlNode> rows =
    doc.DocumentNode.SelectNodes("//tr") ?? Enumerable.Empty<HtmlNode>();

// A row opens a new tier when its class attribute is exactly "header".
// GetAttributeValue yields the supplied default when the attribute is absent,
// which avoids a NullReferenceException on rows that carry other attributes
// but no class.
var tiers = rows.SplitBy(tr => tr.GetAttributeValue("class", "") == "header");
The second step is to extract the data from each tier:
// Flatten every tier into (tier name, link text) pairs.
// - The tier name is the text of the th cell in the group's first row; a group
//   whose first row has no th cell (SelectSingleNode returns null) is skipped
//   rather than causing a NullReferenceException.
// - SelectNodes returns null for a row without any td/a links, so it is
//   replaced with an empty sequence to keep SelectMany from throwing.
var result = from t in tiers
             let header = t.First().SelectSingleNode("th")
             where header != null
             let tier = header.InnerText
             from a in t.Skip(1).SelectMany(
                 tr => tr.SelectNodes("td/a") ?? Enumerable.Empty<HtmlNode>())
             select new {
                 Tier = tier,
                 Value = a.InnerText
             };
The result is:
[
{ Tier: "Tier 1", Value: "First Thing" },
{ Tier: "Tier 1", Value: "Second Thing" },
{ Tier: "Tier 1", Value: "Third Thing" },
{ Tier: "Tier 1", Value: "Fourth Thing" },
{ Tier: "Tier 1", Value: "Fifth Thing" },
{ Tier: "Tier 1", Value: "Sixth Thing" },
{ Tier: "Tier 2", Value: "First Thing" },
{ Tier: "Tier 2", Value: "Second Thing" },
{ Tier: "Tier 2", Value: "Third Thing" },
{ Tier: "Tier 2", Value: "Fourth Thing" },
{ Tier: "Tier 2", Value: "Fifth Thing" },
{ Tier: "Tier 2", Value: "Sixth Thing" }
]