From 70af90556e0ed3fb9ba0e1ed20ed2995462b6c18 Mon Sep 17 00:00:00 2001 From: Milen Date: Tue, 9 Jan 2018 16:15:37 +0800 Subject: [PATCH 1/7] update netstandard2.0 --- .gitignore | 1 + NSoup.Tests/NSoup.Tests.csproj | 18 + NSoup.Tests/Test.cs | 78 + NSoup.sln | 35 +- NSoup/Helper/DataUtil.cs | 370 ++-- NSoup/Helper/DescendableLinkedList.cs | 5 +- NSoup/Helper/HttpConnection.cs | 2274 ++++++++++++------------- NSoup/Helper/LinkedHashSet.cs | 5 +- NSoup/Helper/StringUtil.cs | 314 ++-- NSoup/HttpStatusException.cs | 10 +- NSoup/IConnection.cs | 1024 ++++++----- NSoup/NSoup.cs | 9 +- NSoup/NSoup.csproj | 123 +- NSoup/NSoup.csproj.user | 2 + NSoup/NSoup.snk | Bin 596 -> 0 bytes NSoup/Nodes/Attribute.cs | 5 +- NSoup/Nodes/Attributes.cs | 4 +- NSoup/Nodes/Comment.cs | 7 +- NSoup/Nodes/DataNode.cs | 6 +- NSoup/Nodes/Document.cs | 1046 ++++++------ NSoup/Nodes/DocumentType.cs | 10 +- NSoup/Nodes/Node.cs | 36 +- NSoup/Nodes/TextNode.cs | 10 +- NSoup/Nodes/XmlDeclaration.cs | 5 +- NSoup/Parse/CharacterReader.cs | 12 +- NSoup/Parse/HtmlTreeBuilder.cs | 39 +- NSoup/Parse/HtmlTreeBuilderState.cs | 7 +- NSoup/Parse/ParseError.cs | 9 +- NSoup/Parse/ParseErrorList.cs | 5 +- NSoup/Parse/Parser.cs | 27 +- NSoup/Parse/Tag.cs | 2 - NSoup/Parse/Token.cs | 14 +- NSoup/Parse/TokenQueue.cs | 2 - NSoup/Parse/Tokeniser.cs | 12 +- NSoup/Parse/TokeniserState.cs | 21 +- NSoup/Parse/TreeBuilder.cs | 3 - NSoup/Parse/TreeBuilderState.cs | 2223 ------------------------ NSoup/Parse/XmlTreeBuilder.cs | 2 - NSoup/Properties/AssemblyInfo.cs | 41 - NSoup/Safety/Cleaner.cs | 8 +- NSoup/Safety/Whitelist.cs | 6 +- NSoup/Select/Collector.cs | 6 +- NSoup/Select/CombiningEvaluator.cs | 6 +- NSoup/Select/Elements.cs | 7 +- NSoup/Select/Evaluator.cs | 7 +- NSoup/Select/NodeTraversor.cs | 6 +- NSoup/Select/NodeVisitor.cs | 6 +- NSoup/Select/QueryParser.cs | 29 +- NSoup/Select/Selector.cs | 7 +- NSoup/Select/StructuralEvaluator.cs | 6 +- NSoup/UnsupportedMimeTypeException.cs | 8 +- Test/Test.csproj | 2 +- 52 files 
changed, 2783 insertions(+), 5137 deletions(-) create mode 100644 NSoup.Tests/NSoup.Tests.csproj create mode 100644 NSoup.Tests/Test.cs delete mode 100644 NSoup/NSoup.snk delete mode 100644 NSoup/Parse/TreeBuilderState.cs delete mode 100644 NSoup/Properties/AssemblyInfo.cs diff --git a/.gitignore b/.gitignore index 6083ece..fc0a7bc 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,4 @@ UpgradeLog*.XML .sass-cache .sass-cache/* +/.vs/NSoup/v15/Server/sqlite3 diff --git a/NSoup.Tests/NSoup.Tests.csproj b/NSoup.Tests/NSoup.Tests.csproj new file mode 100644 index 0000000..3d23f7b --- /dev/null +++ b/NSoup.Tests/NSoup.Tests.csproj @@ -0,0 +1,18 @@ + + + + netcoreapp2.0 + + + + + + + + + + + + + + diff --git a/NSoup.Tests/Test.cs b/NSoup.Tests/Test.cs new file mode 100644 index 0000000..d9cce71 --- /dev/null +++ b/NSoup.Tests/Test.cs @@ -0,0 +1,78 @@ +using NSoup.Nodes; +using NSoup.Parse; +using NSoup.Select; +using System; +using System.Collections.Generic; +using System.Net.Http; +using System.Text; +using System.Threading.Tasks; +using Xunit; +namespace NSoup.Tests +{ + public class Http + { + static Http() + { + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + } + private readonly HttpClient httpClient; + private HttpClientHandler httpClientHandler; + public Http(Uri proxyAddrss = null, string proxyUserName = null, string proxyPassword = null) + { + + + if (proxyAddrss != null) + { + //启用代理 + httpClientHandler = new HttpClientHandler() + { + UseProxy = true, + UseCookies = true, + UseDefaultCredentials = false, + //Proxy = new WebProxy(proxyAddrss, true) + //{ + // Credentials = !string.IsNullOrWhiteSpace(proxyUserName) ? 
new NetworkCredential(proxyUserName, proxyPassword) : null + //} + }; + + } + else + { + httpClientHandler = new HttpClientHandler() { UseCookies = true }; + } + httpClient = new HttpClient(httpClientHandler); + httpClient.Timeout = new TimeSpan(1, 0, 0); + //client.DefaultRequestHeaders.Add("Accept-Encoding", "gzip"); + httpClient.DefaultRequestHeaders.Add("Connection", "Keep-Alive"); + + } + + public async Task GetHtml(string url, Dictionary headers = null) + { + httpClient.DefaultRequestHeaders.Host = (new Uri(url).Host); + if (headers != null) + { + foreach (var key in headers.Keys) + { + httpClient.DefaultRequestHeaders.Add(key, headers[key]); + } + } + + using (var result = await httpClient.GetAsync(url)) + { + return await result.Content.ReadAsStringAsync(); + } + } + } + public class Test + { + [Fact] + public void TestRun() + { + Http http = new Http(); + var html = http.GetHtml("https://dealer.autohome.com.cn/1120/info.html").Result; + Document doc = Parser.Parse(html, "http://dealer.autohome.com.cn/"); + Elements elems = doc.Select(".dealeron-cont .show-ul li img"); + } + } +} diff --git a/NSoup.sln b/NSoup.sln index 5a9105b..8e389f4 100644 --- a/NSoup.sln +++ b/NSoup.sln @@ -1,20 +1,13 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 2012 -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{C66E2854-5CFF-4DD3-B867-3FE49DE1E96A}" - ProjectSection(SolutionItems) = preProject - LocalTestRun.testrunconfig = LocalTestRun.testrunconfig - NSoup.vsmdi = NSoup.vsmdi - EndProjectSection +# Visual Studio 15 +VisualStudioVersion = 15.0.27130.2010 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "NSoup", "NSoup\NSoup.csproj", "{EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NSoup", "NSoup\NSoup.csproj", "{EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}" -EndProject 
-Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}" +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NSoup.Tests", "NSoup.Tests\NSoup.Tests.csproj", "{3DFFF8C7-74AC-45EB-B391-96AC7B01ADEA}" EndProject Global - GlobalSection(TestCaseManagementSettings) = postSolution - CategoryFile = NSoup.vsmdi - EndGlobalSection GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Release|Any CPU = Release|Any CPU @@ -24,16 +17,18 @@ Global {EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}.Debug|Any CPU.Build.0 = Debug|Any CPU {EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}.Release|Any CPU.ActiveCfg = Release|Any CPU {EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33}.Release|Any CPU.Build.0 = Release|Any CPU - {B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}.Debug|Any CPU.Build.0 = Debug|Any CPU - {B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}.Release|Any CPU.ActiveCfg = Release|Any CPU - {B9BBCF9A-4E79-4A90-8401-C11D4D7A86B0}.Release|Any CPU.Build.0 = Release|Any CPU - {51AD4E15-891C-45D7-9AA1-B83A69E53B07}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {51AD4E15-891C-45D7-9AA1-B83A69E53B07}.Debug|Any CPU.Build.0 = Debug|Any CPU - {51AD4E15-891C-45D7-9AA1-B83A69E53B07}.Release|Any CPU.ActiveCfg = Release|Any CPU - {51AD4E15-891C-45D7-9AA1-B83A69E53B07}.Release|Any CPU.Build.0 = Release|Any CPU + {3DFFF8C7-74AC-45EB-B391-96AC7B01ADEA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {3DFFF8C7-74AC-45EB-B391-96AC7B01ADEA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {3DFFF8C7-74AC-45EB-B391-96AC7B01ADEA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {3DFFF8C7-74AC-45EB-B391-96AC7B01ADEA}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {6DBC92AF-821B-4500-8940-611594BF2095} + 
EndGlobalSection + GlobalSection(TestCaseManagementSettings) = postSolution + CategoryFile = NSoup.vsmdi + EndGlobalSection EndGlobal diff --git a/NSoup/Helper/DataUtil.cs b/NSoup/Helper/DataUtil.cs index 208a22f..10073cc 100644 --- a/NSoup/Helper/DataUtil.cs +++ b/NSoup/Helper/DataUtil.cs @@ -9,189 +9,189 @@ namespace NSoup.Helper { - /// - /// Internal static utilities for handling data. - /// - public class DataUtil - { - public static int BoundaryLength { get { return 32; } } - - private static readonly Regex _charsetPattern = new Regex("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)", RegexOptions.Compiled); - private static readonly Encoding _defaultEncoding = Encoding.UTF8; - private static readonly char[] mimeBoundaryChars = "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".ToArray(); - - private DataUtil() { } - - public static Encoding DefaultEncoding - { - get { return _defaultEncoding; } - } - - /// - /// Loads a file to a string. - /// - /// - /// - /// - public static Document Load(Stream input, string charsetName, string baseUri) - { - byte[] data = ReadToByteBuffer(input); - Document doc = ParseByteData(data, charsetName, baseUri, Parser.HtmlParser()); - input.Close(); - return doc; - } - - /// - /// Parses a Document from an input steam, using the provided Parser. - /// - /// Input stream to parse. You will need to close it - /// Character set of input - /// Base URI of document, to resolve relative links against - /// Alternate parser to use - /// - public static Document Load(Stream input, string charsetName, string baseUri, Parser parser) - { - byte[] data = ReadToByteBuffer(input); - - Document doc = ParseByteData(data, charsetName, baseUri, parser); - - input.Close(); - - return doc; - } - - // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support - // switching the chartset midstream when a meta http-equiv tag defines the charset. 
- public static Document ParseByteData(byte[] data, string charsetName, string baseUri, Parser parser) - { - var docData = string.Empty; - Document doc = null; - - if (charsetName == null) - { - // determine from meta. safe parse as UTF-8 - - // look for or HTML5 - docData = _defaultEncoding.GetString(data); - doc = parser.ParseInput(docData, baseUri); - Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]").FirstOrDefault(); - - if (meta != null) - { - // if not found, will keep utf-8 as best attempt - string foundCharset = meta.HasAttr("http-equiv") ? GetCharsetFromContentType(meta.Attr("content")) : meta.Attr("charset"); - - if (foundCharset != null && foundCharset.Length != 0 && !foundCharset.Equals(_defaultEncoding.WebName.ToUpperInvariant())) - { // need to re-decode - charsetName = foundCharset; - - docData = Encoding.GetEncoding(foundCharset).GetString(data); - doc = null; - } - } - } - else - { - // specified by content type header (or by user on file load) - if (string.IsNullOrEmpty(charsetName)) - { - throw new Exception("Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); - } - - docData = Encoding.GetEncoding(charsetName).GetString(data); - } - - if (doc == null) - { - // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present - // in utf-8. 
If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight - // into head mode - if (docData.Length > 0 && docData[0] == 65279) - { - docData = docData.Substring(1); - } - - doc = parser.ParseInput(docData, baseUri); - doc.OutputSettings().SetEncoding(charsetName); - } - return doc; - } - - public static byte[] ReadToByteBuffer(Stream input) - { - using (MemoryStream ms = new MemoryStream()) - { - byte[] buffer = new byte[32768]; - - int count = input.Read(buffer, 0, buffer.Length); - ms.Write(buffer, 0, count); - - while (count > 0) - { - count = input.Read(buffer, 0, buffer.Length); - ms.Write(buffer, 0, count); - } - - return ms.ToArray(); - } - } - - public static string MimeBoundary() - { - var mime = new StringBuilder(BoundaryLength); - var rand = new Random(); - for (var i = 0; i < BoundaryLength; i++) - { - mime.Append(mimeBoundaryChars[rand.Next(mimeBoundaryChars.Length)]); - } - - return mime.ToString(); - } - - - /// - /// Parse out a charset from a content type header. If the charset is not supported, returns null (so the default - /// will kick in.) - /// - /// e.g. "text/html; charset=EUC-JP" - /// "EUC-JP", or null if not found. Charset is trimmed and uppercased. 
- internal static string GetCharsetFromContentType(string contentType) - { - if (contentType == null) - { - return null; - } - - Match m = _charsetPattern.Match(contentType); - if (m.Success) - { - string charset = m.Groups[1].Value.Trim(); - charset = charset.Replace("charset=", string.Empty); - var pattern = "[\',]"; - var regEx = new Regex(pattern); - charset = regEx.Replace(charset, string.Empty); - - if (charset.Length == 0) - { - return null; - } - - try - { - Encoding.GetEncoding(charset); - return charset; - } - catch (Exception e) { var a = e.Message; } - - charset = charset.ToUpper(CultureInfo.CreateSpecificCulture("en-US")); - try - { - Encoding.GetEncoding(charset); - return charset; - } - catch (Exception e) { var a = e.Message; } - } - return null; - } - } + /// + /// Internal static utilities for handling data. + /// + public class DataUtil + { + public static int BoundaryLength { get { return 32; } } + + private static readonly Regex _charsetPattern = new Regex("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)", RegexOptions.Compiled); + private static readonly Encoding _defaultEncoding = Encoding.UTF8; + private static readonly char[] mimeBoundaryChars = "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".ToArray(); + + private DataUtil() { } + + public static Encoding DefaultEncoding + { + get { return _defaultEncoding; } + } + + /// + /// Loads a file to a string. + /// + /// + /// + /// + public static Document Load(Stream input, string charsetName, string baseUri) + { + byte[] data = ReadToByteBuffer(input); + Document doc = ParseByteData(data, charsetName, baseUri, Parser.HtmlParser()); + input.Close(); + return doc; + } + + /// + /// Parses a Document from an input steam, using the provided Parser. + /// + /// Input stream to parse. 
You will need to close it + /// Character set of input + /// Base URI of document, to resolve relative links against + /// Alternate parser to use + /// + public static Document Load(Stream input, string charsetName, string baseUri, Parser parser) + { + byte[] data = ReadToByteBuffer(input); + + Document doc = ParseByteData(data, charsetName, baseUri, parser); + + input.Close(); + + return doc; + } + + // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support + // switching the chartset midstream when a meta http-equiv tag defines the charset. + public static Document ParseByteData(byte[] data, string charsetName, string baseUri, Parser parser) + { + var docData = string.Empty; + Document doc = null; + + if (charsetName == null) + { + // determine from meta. safe parse as UTF-8 + + // look for or HTML5 + docData = _defaultEncoding.GetString(data); + doc = parser.ParseInput(docData, baseUri); + Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]").FirstOrDefault(); + + if (meta != null) + { + // if not found, will keep utf-8 as best attempt + string foundCharset = meta.HasAttr("http-equiv") ? GetCharsetFromContentType(meta.Attr("content")) : meta.Attr("charset"); + + if (foundCharset != null && foundCharset.Length != 0 && !foundCharset.Equals(_defaultEncoding.WebName.ToUpperInvariant())) + { // need to re-decode + charsetName = foundCharset; + + docData = Encoding.GetEncoding(foundCharset).GetString(data); + doc = null; + } + } + } + else + { + // specified by content type header (or by user on file load) + if (string.IsNullOrEmpty(charsetName)) + { + throw new Exception("Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); + } + + docData = Encoding.GetEncoding(charsetName).GetString(data); + } + + if (doc == null) + { + // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present + // in utf-8. 
If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight + // into head mode + if (docData.Length > 0 && docData[0] == 65279) + { + docData = docData.Substring(1); + } + + doc = parser.ParseInput(docData, baseUri); + doc.OutputSettings().SetEncoding(charsetName); + } + return doc; + } + + public static byte[] ReadToByteBuffer(Stream input) + { + using (MemoryStream ms = new MemoryStream()) + { + byte[] buffer = new byte[32768]; + + int count = input.Read(buffer, 0, buffer.Length); + ms.Write(buffer, 0, count); + + while (count > 0) + { + count = input.Read(buffer, 0, buffer.Length); + ms.Write(buffer, 0, count); + } + + return ms.ToArray(); + } + } + + public static string MimeBoundary() + { + var mime = new StringBuilder(BoundaryLength); + var rand = new Random(); + for (var i = 0; i < BoundaryLength; i++) + { + mime.Append(mimeBoundaryChars[rand.Next(mimeBoundaryChars.Length)]); + } + + return mime.ToString(); + } + + + /// + /// Parse out a charset from a content type header. If the charset is not supported, returns null (so the default + /// will kick in.) + /// + /// e.g. "text/html; charset=EUC-JP" + /// "EUC-JP", or null if not found. Charset is trimmed and uppercased. 
+ internal static string GetCharsetFromContentType(string contentType) + { + if (contentType == null) + { + return null; + } + + Match m = _charsetPattern.Match(contentType); + if (m.Success) + { + string charset = m.Groups[1].Value.Trim(); + charset = charset.Replace("charset=", string.Empty); + var pattern = "[\',]"; + var regEx = new Regex(pattern); + charset = regEx.Replace(charset, string.Empty); + + if (charset.Length == 0) + { + return null; + } + + try + { + Encoding.GetEncoding(charset); + return charset; + } + catch (Exception e) { var a = e.Message; } + + charset = charset.ToUpper(CultureInfo.CreateSpecificCulture("en-US")); + try + { + Encoding.GetEncoding(charset); + return charset; + } + catch (Exception e) { var a = e.Message; } + } + return null; + } + } } \ No newline at end of file diff --git a/NSoup/Helper/DescendableLinkedList.cs b/NSoup/Helper/DescendableLinkedList.cs index 2386b27..47c767f 100644 --- a/NSoup/Helper/DescendableLinkedList.cs +++ b/NSoup/Helper/DescendableLinkedList.cs @@ -1,7 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; +using System.Collections.Generic; namespace NSoup.Helper { diff --git a/NSoup/Helper/HttpConnection.cs b/NSoup/Helper/HttpConnection.cs index b3be6a4..60bc106 100644 --- a/NSoup/Helper/HttpConnection.cs +++ b/NSoup/Helper/HttpConnection.cs @@ -10,1141 +10,1141 @@ namespace NSoup.Helper { - public class HttpConnection : IConnection - { - #region IConnection Members - - public static readonly string CONTENT_ENCODING = "Content-Encoding"; - - public static IConnection Connect(string url) - { - var con = new HttpConnection(); - con.Url(url); - return con; - } - - public static IConnection Connect(Uri url) - { - var con = new HttpConnection(); - con.Url(url); - return con; - } - - private static string EncodeUrl(string url) - { - if(string.IsNullOrWhiteSpace(url)) - { - return null; - } - - return url.Replace(" ", "%20"); - } - - private static string 
EncodeMimeName(string val) - { - if (string.IsNullOrWhiteSpace(val)) - { - return null; - } - - return val.Replace("\"", "%22"); - } - - private IRequest req; - private IResponse res; - - private HttpConnection() - { - req = new Request(); - res = new Response(); - } - - public IConnection Url(Uri url) - { - req.Url(url); - return this; - } - - public IConnection Url(string url) - { - if (string.IsNullOrEmpty(url)) - { - throw new ArgumentException("Must supply a valid URL", "url"); - } - - try - { - req.Url(new Uri(url)); - } - catch (UriFormatException e) - { - throw new ArgumentException("Malformed URL: " + url, e); - } - return this; - } - - public IConnection UserAgent(string userAgent) - { - if (userAgent == null) - { - throw new ArgumentNullException("userAgent"); - } - - req.Header("User-Agent", userAgent); - - return this; - } - - public IConnection Timeout(int millis) - { - req.Timeout(millis); - return this; - } - - public IConnection MaxBodySize(int bytes) - { - req.MaxBodySize(bytes); - return this; - } - - public IConnection FollowRedirects(bool followRedirects) - { - req.FollowRedirects(followRedirects); - return this; - } - - public IConnection Referrer(string referrer) - { - if (referrer == null) - { - throw new ArgumentNullException("referrer"); - } - - req.Header("Referer", referrer); // Note "Referer" is the actual header spelling. 
- - return this; - } - - public IConnection Method(Method method) - { - req.Method(method); - return this; - } - - public IConnection IgnoreHttpErrors(bool ignoreHttpErrors) - { - req.IgnoreHttpErrors(ignoreHttpErrors); - return this; - } - - public IConnection IgnoreContentType(bool ignoreContentType) - { - req.IgnoreContentType(ignoreContentType); - return this; - } - - public IConnection ValidateTLSCertificates(bool value) - { - req.ValidateTLSCertificates(value); - return this; - } - - public IConnection Data(string key, string value) - { - req.Data(KeyVal.Create(key, value)); - return this; - } - - public IConnection Data(string key, string fileName, Stream stream) - { - req.Data(KeyVal.Create(key, fileName, stream)); - return this; - } - - public IConnection Data(IDictionary data) - { - if (data == null) - { - throw new ArgumentNullException("data"); - } - - foreach (KeyValuePair entry in data) - { - req.Data(KeyVal.Create(entry.Key, entry.Value)); - } - - return this; - } - - public IConnection Data(params string[] keyvals) - { - if (keyvals == null) - { - throw new ArgumentNullException("keyvals"); - } - - if ((keyvals.Length % 2) != 0) - { - throw new InvalidOperationException("Must supply an even number of key value pairs"); - } - - for (int i = 0; i < keyvals.Length; i += 2) - { - var key = keyvals[i]; - var value = keyvals[i + 1]; - - if (string.IsNullOrWhiteSpace(key)) - { - throw new ArgumentException("Data key must not be empty"); - } - - if (value == null) - { - throw new ArgumentException("Data value must not be null"); - } - - req.Data(KeyVal.Create(key, value)); - } - - return this; - } - - public IConnection Header(string name, string value) - { - req.Header(name, value); - return this; - } - - public IConnection Cookie(string name, string value) - { - req.Cookie(name, value); - return this; - } - - public IConnection Cookies(IDictionary cookies) - { - if (cookies == null) - { - throw new ArgumentNullException("cookies"); - } - foreach (var 
entry in cookies) - { - req.Cookie(entry.Key, entry.Value); - } - - return this; - } - - public IConnection Parser(Parser parser) - { - req.Parser(parser); - return this; - } - - public Document Get() - { - req.Method(NSoup.Method.Get); - Execute(); - return res.Parse(); - } - - public Document Post() - { - req.Method(NSoup.Method.Post); - Execute(); - return res.Parse(); - } - - public IResponse Execute() - { - res = Helper.Response.Execute(req); - return res; - } - - public IRequest Request() - { - return req; - } - - public IConnection Request(IRequest request) - { - req = request; - return this; - } - - public IResponse Response() - { - return res; - } - - public IConnection Response(IResponse response) - { - res = response; - return this; - } - - #endregion - } - - public abstract class ConnectionBase : IConnectionBase where T : IConnectionBase - { - protected Uri url; - protected Method method; - protected IDictionary headers; - protected IDictionary cookies; - - private class CaseInsensitiveComparer : IComparer - { - #region IComparer Members - - public int Compare(string x, string y) - { - return string.Compare(x, y, true); - } - - #endregion - } - - protected ConnectionBase() - { - headers = new SortedDictionary(new CaseInsensitiveComparer()); - cookies = new SortedDictionary(); - } - - public Uri Url() - { - return url; - } - - public IConnectionBase Url(Uri url) - { - if (url == null) - { - throw new ArgumentNullException("url"); - } - - this.url = url; - return this; - } - - public Method Method() - { - return method; - } - - public IConnectionBase Method(Method method) - { - this.method = method; - return this; - } - - public string Header(string name) - { - if (name == null) - { - throw new ArgumentNullException("name"); - } - - return GetHeaderCaseInsensitive(name); - } - - public IConnectionBase Header(string name, string value) - { - if (string.IsNullOrEmpty(name)) - { - throw new ArgumentException("Header name must not be empty", "name"); - } - - 
if (value == null) - { - throw new ArgumentNullException("value"); - } - - RemoveHeader(name); // ensures we don't get an "accept-encoding" and a "Accept-Encoding" - - headers[name] = value; - - return this; - } - - public bool HasHeader(string name) - { - if (string.IsNullOrEmpty(name)) - { - throw new ArgumentException("Header name must not be empty", "name"); - } - - return GetHeaderCaseInsensitive(name) != null; - } - - public bool HasHeaderWithValue(String name, String value) - { - return HasHeader(name) && Header(name).Equals(value, StringComparison.InvariantCultureIgnoreCase); - } - - public IConnectionBase RemoveHeader(string name) - { - if (string.IsNullOrEmpty(name)) - { - throw new ArgumentException("Header name must not be empty", "name"); - } - - KeyValuePair? entry = ScanHeaders(name); // remove is case insensitive too - if (entry != null) - { - headers.Remove(entry.Value.Key); // ensures correct case - } - - return this; - } - - public IDictionary Headers() - { - return headers; - } - - private string GetHeaderCaseInsensitive(string name) - { - if (name == null) - { - throw new ArgumentNullException("name", "Header name must not be null"); - } - - // quick evals for common case of title case, lower case, then scan for mixed - string value = null; - - if (!headers.TryGetValue(name, out value)) // Also case insensitive thanks to the CaseInsensitiveComparer. - { - KeyValuePair? entry = ScanHeaders(name); - if (entry != null) - { - value = entry.Value.Value; - } - } - - return value; - } - - private KeyValuePair? 
ScanHeaders(string name) - { - var lc = name.ToLowerInvariant(); - foreach (var entry in headers) - { - if (entry.Key.ToLowerInvariant().Equals(lc)) - { - return entry; - } - } - return null; - } - - public string Cookie(string name) - { - if (name == null) - { - throw new ArgumentNullException("name"); - } - - return cookies[name]; - } - - public IConnectionBase Cookie(string name, string value) - { - if (string.IsNullOrEmpty(name)) - { - throw new ArgumentException("Cookie name must not be empty", "name"); - } - - if (value == null) - { - throw new ArgumentNullException("value"); - } - - cookies[name] = value; - - return this; - } - - public bool HasCookie(string name) - { - if (string.IsNullOrEmpty(name)) - { - throw new ArgumentException("Cookie name must not be empty", "name"); - } - - return cookies.ContainsKey(name); - } - - public IConnectionBase RemoveCookie(string name) - { - if (string.IsNullOrEmpty(name)) - { - throw new ArgumentException("Cookie name must not be empty", "name"); - } - - cookies.Remove(name); - - return this; - } - - public IDictionary Cookies() - { - return cookies; - } - } - - public class Response : ConnectionBase, IResponse - { - private static readonly int MAX_REDIRECTS = 20; - private HttpStatusCode statusCode; - private string statusMessage; - private byte[] byteData; - private string charset; - private string contentType; - private bool executed = false; - private int numRedirects = 0; - private IRequest req; - - public Response() - : base() - { } - - private Response(IResponse previousResponse) - : base() - { - if (previousResponse != null) - { - numRedirects = previousResponse.NumRedirects + 1; - if (numRedirects >= MAX_REDIRECTS) - { - throw new IOException(string.Format("Too many redirects occurred trying to load URL {0}", previousResponse.Url())); - } - } - } - - public static Response Execute(IRequest req) - { - return Execute(req, null); - } - - public static Response Execute(IRequest req, IResponse previousResponse) - { 
- if (req == null) - { - throw new ArgumentNullException("req", "Request must not be null"); - } - - string protocol = req.Url().Scheme; - - if (!protocol.Equals("http") && !protocol.Equals("https")) - { - // throw new MalformedURLException - throw new InvalidOperationException("Only http & https protocols supported"); - } - - // set up the request for execution - if (req.Method() == NSoup.Method.Get && req.Data().Count > 0) - { - SerialiseRequestUrl(req); // appends query string - } - - var conn = CreateConnection(req); - HttpWebResponse response = null; - - try - { - if (req.Method() == NSoup.Method.Post) - { - conn.ContentType = "application/x-www-form-urlencoded"; - WritePost(req.Data(), conn.GetRequestStream()); - } - - response = (HttpWebResponse)conn.GetResponse(); - } - catch (WebException e) - { - response = e.Response as HttpWebResponse; - - if (response != null) - { - return ProcessResponse(response, req, previousResponse); - } - } - - return ProcessResponse(response, req, previousResponse); - } - - private static Response ProcessResponse(HttpWebResponse response, IRequest req, IResponse previousResponse) - { - var needsRedirect = false; - var status = response.StatusCode; - - if (status != HttpStatusCode.OK) - { - if (status == HttpStatusCode.Found || status == HttpStatusCode.MovedPermanently || status == HttpStatusCode.SeeOther) - { - // In .NET (4.0+ ?), Moved and MovedPermanently have the same value. - needsRedirect = true; - } - else if (!req.IgnoreHttpErrors()) - { - throw new HttpStatusException("HTTP error fetching URL", (int)status, req.Url().ToString()); - } - } - - var res = new Response(previousResponse); - res.SetupFromConnection(response, previousResponse); - - if (needsRedirect && req.FollowRedirects()) - { - req.Method(NSoup.Method.Get); - req.Data().Clear(); - req.Url(new Uri(req.Url(), res.Header("Location"))); - - foreach (var cookie in res.Cookies()) // add response cookies to request (for e.g. 
login posts) - { - req.Cookie(cookie.Key, cookie.Value); - } - - return Execute(req, res); - } - - res.req = req; - - // check that we can handle the returned content type; if not, abort before fetching it - var contentType = res.ContentType(); - - if (contentType != null && !req.IgnoreContentType() && (!(contentType.StartsWith("text/") || contentType.StartsWith("application/xml") || contentType.StartsWith("application/xhtml+xml")))) - { - throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml", - contentType, req.Url().ToString()); - } - - //dataStream = conn.getErrorStream() != null ? conn.getErrorStream() : conn.getInputStream(); - //bodyStream = res.hasHeader("Content-Encoding") && res.header("Content-Encoding").equalsIgnoreCase("gzip") ? - // new BufferedInputStream(new GZIPInputStream(dataStream)) : - // new BufferedInputStream(dataStream); - - using (var inStream = - (res.HasHeader("Content-Encoding") && res.Header("Content-Encoding").Equals("gzip")) ? 
- new GZipStream(response.GetResponseStream(), CompressionMode.Decompress) : - response.GetResponseStream()) - { - res.byteData = DataUtil.ReadToByteBuffer(inStream); - res.charset = DataUtil.GetCharsetFromContentType(res.ContentType()); // may be null, readInputStream deals with it - } - - res.executed = true; - - return res; - } - - public HttpStatusCode StatusCode() - { - return statusCode; - } - - public string StatusMessage() - { - return statusMessage; - } - - public string Charset() - { - return charset; - } - - public string ContentType() - { - return contentType; - } - - public Document Parse() - { - if (!executed) - { - throw new InvalidOperationException("Request must be executed (with .Execute(), .Get(), or .Post() before parsing response "); - } - - var doc = DataUtil.ParseByteData(byteData, charset, url.ToString(), req.Parser()); - - charset = doc.OutputSettings().Encoding.WebName.ToUpperInvariant(); // update charset from meta-equiv, possibly - return doc; - } - - public string Body() - { - if (!executed) - { - throw new InvalidOperationException("Request must be executed (with .Execute(), .Get(), or .Post() before getting response body"); - } - - // charset gets set from header on execute, and from meta-equiv on parse. parse may not have happened yet - return string.IsNullOrWhiteSpace(charset) ? 
DataUtil.DefaultEncoding.GetString(byteData) : - Encoding.GetEncoding(charset).GetString(byteData); - } - - public byte[] BodyAsBytes() - { - if (!executed) - { - throw new InvalidOperationException("Request must be executed (with .Execute(), .Get(), or .Post() before getting response body"); - } - return byteData; - } - - // set up connection defaults, and details from request - private static HttpWebRequest CreateConnection(IRequest req) - { - var conn = (HttpWebRequest)HttpWebRequest.Create(req.Url()); - conn.Method = req.Method().ToString(); - conn.AllowAutoRedirect = false; // don't rely on native redirection support - conn.Timeout = req.Timeout(); - conn.ReadWriteTimeout = req.Timeout(); - - if (req.Cookies().Count > 0) - { - conn.Headers.Add(HttpRequestHeader.Cookie, GetRequestCookieString(req)); - } - - // Added due to incosistent behavior by .NET when trying to add this header. - if (req.HasHeader("Referer")) - { - conn.Referer = req.Header("Referer"); - req.RemoveHeader("Referer"); - } - - // Same as above. 
- if (req.HasHeader("User-Agent")) - { - conn.UserAgent = req.Header("User-Agent"); - req.RemoveHeader("User-Agent"); - } - - foreach (KeyValuePair header in req.Headers()) - { - conn.Headers.Add(header.Key, header.Value); - } - - return conn; - } - - - - // set up url, method, header, cookies - private void SetupFromConnection(HttpWebResponse conn, IResponse previousResponse) - { - - method = (Method)Enum.Parse(typeof(Method), conn.Method, true); - - url = conn.ResponseUri; - statusCode = conn.StatusCode; - statusMessage = conn.StatusDescription; - contentType = conn.ContentType; - - // headers into map - var resHeaders = conn.Headers; - - ProcessResponseHeaders(resHeaders); - - // if from a redirect, map previous response cookies into this response - if (previousResponse != null) - { - foreach (var prevCookie in previousResponse.Cookies()) - { - if (!HasCookie(prevCookie.Key)) - { - Cookie(prevCookie.Key, prevCookie.Value); - } - } - } - } - - public void ProcessResponseHeaders(WebHeaderCollection resHeaders) - { - foreach (string name in resHeaders.Keys) - { - if (string.IsNullOrWhiteSpace(name)) - { - continue; // http/1.1 line - } - - var value = resHeaders[name]; //.Split(';'); - - if (name.Equals("Set-Cookie", StringComparison.InvariantCultureIgnoreCase)) - { - var values = resHeaders["Set-Cookie"].Split(';', ','); - foreach (string v in values) - { - if (string.IsNullOrWhiteSpace(v)) - { - continue; - } - - var cd = new TokenQueue(v); - var cookieName = cd.ChompTo("=").Trim(); - var cookieVal = cd.ConsumeTo(";").Trim(); - - if (cookieVal == null) - { - cookieVal = string.Empty; - } - - // ignores path, date, domain, secure et al. req'd? - if (StringUtil.In(cookieName.ToLowerInvariant(), "domain", "path", "expires", "max-age", "secure", "httponly")) - { - // This is added for NSoup, since we do headers a bit differently around here. 
- continue; - } - - // name not blank, value not null - if (!string.IsNullOrEmpty(cookieName)) - { - Cookie(cookieName, cookieVal); - } - } - } - else - { - if (!string.IsNullOrEmpty(value)) - { - Header(name, /*values[0]*/ value); - } - } - } - } - - private static void WritePost(ICollection data, Stream outputStream) - { - var sb = new StringBuilder(); - - var first = true; - foreach (var keyVal in data) - { - if (!first) - { - sb.Append('&'); - } - else - { - first = false; - } - - sb.Append(HttpUtility.UrlEncode(keyVal.Key(), DataUtil.DefaultEncoding)) - .Append('=') - .Append(HttpUtility.UrlEncode(keyVal.Value(), DataUtil.DefaultEncoding)); - } - - var bytes = DataUtil.DefaultEncoding.GetBytes(sb.ToString()); - - outputStream.Write(bytes, 0, bytes.Length); - outputStream.Close(); - } - - private static string GetRequestCookieString(IRequest req) - { - var sb = new StringBuilder(); - var first = true; - foreach (var cookie in req.Cookies()) - { - if (!first) - { - sb.Append("; "); - } - else - { - first = false; - } - sb.Append(cookie.Key).Append('=').Append(cookie.Value); - // todo: spec says only ascii, no escaping / encoding defined. validate on set? or escape somehow here? 
- } - return sb.ToString(); - } - - // for get url reqs, serialise the data map into the url - private static void SerialiseRequestUrl(IRequest req) - { - var input = req.Url(); - var url = new StringBuilder(); - var first = true; - // reconstitute the query, ready for appends - url.Append(input.Scheme).Append("://").Append(input.Authority) - .Append(input.AbsolutePath).Append("?"); - - if (!string.IsNullOrEmpty(input.Query)) - { - url.Append(input.Query); - first = false; - } - - foreach (var keyVal in req.Data()) - { - if (!first) - { - url.Append('&'); - } - else - { - first = false; - } - - url.Append(HttpUtility.UrlEncode(keyVal.Key(), DataUtil.DefaultEncoding)) - .Append('=').Append(HttpUtility.UrlEncode(keyVal.Value(), DataUtil.DefaultEncoding)); - } - - req.Url(new Uri(url.ToString())); - req.Data().Clear(); // moved into url as get params - } - - public int NumRedirects - { - get - { - return numRedirects; - } - } - } - - public class Request : ConnectionBase, IRequest - { - private int timeoutMilliseconds; - private int maxBodySizeBytes; - private bool followRedirects; - private ICollection data; - private bool ignoreHttpErrors = false; - private bool ignoreContentType = false; - private Parser parser; - private bool parserDefined = false; // called parser(...) 
vs initialized in ctor - private bool validateTSLCertificates = true; - private string postDataCharset = DataUtil.DefaultEncoding.ToString(); - - public Request() - { - timeoutMilliseconds = 3000; - maxBodySizeBytes = 1024 * 1024; - followRedirects = true; - data = new List(); - method = NSoup.Method.Get; - headers["Accept-Encoding"] = "gzip"; - parser = Parse.Parser.HtmlParser(); - } - - public int Timeout() - { - return timeoutMilliseconds; - } - - public IRequest Timeout(int millis) - { - if (millis < 0) - { - throw new ArgumentOutOfRangeException("Timeout milliseconds must be 0 (infinite) or greater"); - } - - timeoutMilliseconds = millis; - - return this; - } - - public int MaxBodySize() - { - return maxBodySizeBytes; - } - - public IRequest MaxBodySize(int bytes) - { - if (bytes < 0) - { - throw new ArgumentOutOfRangeException("Max Size must be 0 (infinite) or greater"); - } - - maxBodySizeBytes = bytes; - return this; - } - - public bool FollowRedirects() - { - return followRedirects; - } - - public IRequest FollowRedirects(bool followRedirects) - { - this.followRedirects = followRedirects; - return this; - } - - public bool IgnoreHttpErrors() - { - return ignoreHttpErrors; - } - - public IRequest IgnoreHttpErrors(bool ignoreHttpErrors) - { - this.ignoreHttpErrors = ignoreHttpErrors; - return this; - } - - public bool ValidateTLSCertificates() - { - return validateTSLCertificates; - } - - public void ValidateTLSCertificates(bool value) - { - validateTSLCertificates = value; - } - - public bool IgnoreContentType() - { - return ignoreContentType; - } - - public IRequest IgnoreContentType(bool ignoreContentType) - { - this.ignoreContentType = ignoreContentType; - return this; - } - - public IRequest Data(KeyVal keyval) - { - if (keyval == null) - { - throw new ArgumentNullException("keyval"); - } - - data.Add(keyval); - - return this; - } - - public ICollection Data() - { - return data; - } - - public IRequest Parser(Parser parser) - { - this.parser = parser; - 
return this; - } - - public Parser Parser() - { - return parser; - } - } - - public class KeyVal : IKeyVal - { - private string key; - private string value; - private Stream stream; - - public static KeyVal Create(string key, string value) - { - if (string.IsNullOrEmpty(key)) - { - throw new ArgumentException("Data key must not be empty", "key"); - } - - if (value == null) - { - throw new ArgumentNullException("value", "Data value must not be null"); - } - - return new KeyVal(key, value); - } - - public static KeyVal Create(string key, string value, Stream stream) - { - if (string.IsNullOrEmpty(key)) - { - throw new ArgumentException("Data key must not be empty", "key"); - } - - if (value == null) - { - throw new ArgumentNullException("value", "Data value must not be null"); - } - - return new KeyVal(key, value, stream); - } - - private KeyVal(string key, string value) - { - this.key = key; - this.value = value; - } - - private KeyVal(string key, string value, Stream stream) - { - this.key = key; - this.value = value; - this.stream = stream; - } - - #region IKeyVal Members - - public IKeyVal Key(string key) - { - if (string.IsNullOrEmpty(key)) - { - throw new ArgumentException("Data key must not be empty", "key"); - } - - this.key = key; - - return this; - } - - public string Key() - { - return key; - } - - public IKeyVal Value(string value) - { - if (value == null) - { - throw new ArgumentNullException("value", "Data value must not be null"); - } - - this.value = value; - - return this; - } - - public string Value() - { - return value; - } - - public IKeyVal InputStream(Stream inputStream) - { - if (inputStream == null) - { - throw new ArgumentNullException("inputStream", "Data input stream must not be null"); - } - - this.stream = inputStream; - return this; - } - - public Stream InputStream() - { - return stream; - } - - public bool HasInputStream() - { - return stream != null; - } - - #endregion - - public override string ToString() - { - return 
string.Concat(key, "=", value); - } - } + public class HttpConnection : IConnection + { + #region IConnection Members + + public static readonly string CONTENT_ENCODING = "Content-Encoding"; + + public static IConnection Connect(string url) + { + var con = new HttpConnection(); + con.Url(url); + return con; + } + + public static IConnection Connect(Uri url) + { + var con = new HttpConnection(); + con.Url(url); + return con; + } + + private static string EncodeUrl(string url) + { + if (string.IsNullOrWhiteSpace(url)) + { + return null; + } + + return url.Replace(" ", "%20"); + } + + private static string EncodeMimeName(string val) + { + if (string.IsNullOrWhiteSpace(val)) + { + return null; + } + + return val.Replace("\"", "%22"); + } + + private IRequest req; + private IResponse res; + + private HttpConnection() + { + req = new Request(); + res = new Response(); + } + + public IConnection Url(Uri url) + { + req.Url(url); + return this; + } + + public IConnection Url(string url) + { + if (string.IsNullOrEmpty(url)) + { + throw new ArgumentException("Must supply a valid URL", "url"); + } + + try + { + req.Url(new Uri(url)); + } + catch (UriFormatException e) + { + throw new ArgumentException("Malformed URL: " + url, e); + } + return this; + } + + public IConnection UserAgent(string userAgent) + { + if (userAgent == null) + { + throw new ArgumentNullException("userAgent"); + } + + req.Header("User-Agent", userAgent); + + return this; + } + + public IConnection Timeout(int millis) + { + req.Timeout(millis); + return this; + } + + public IConnection MaxBodySize(int bytes) + { + req.MaxBodySize(bytes); + return this; + } + + public IConnection FollowRedirects(bool followRedirects) + { + req.FollowRedirects(followRedirects); + return this; + } + + public IConnection Referrer(string referrer) + { + if (referrer == null) + { + throw new ArgumentNullException("referrer"); + } + + req.Header("Referer", referrer); // Note "Referer" is the actual header spelling. 
+ + return this; + } + + public IConnection Method(Method method) + { + req.Method(method); + return this; + } + + public IConnection IgnoreHttpErrors(bool ignoreHttpErrors) + { + req.IgnoreHttpErrors(ignoreHttpErrors); + return this; + } + + public IConnection IgnoreContentType(bool ignoreContentType) + { + req.IgnoreContentType(ignoreContentType); + return this; + } + + public IConnection ValidateTLSCertificates(bool value) + { + req.ValidateTLSCertificates(value); + return this; + } + + public IConnection Data(string key, string value) + { + req.Data(KeyVal.Create(key, value)); + return this; + } + + public IConnection Data(string key, string fileName, Stream stream) + { + req.Data(KeyVal.Create(key, fileName, stream)); + return this; + } + + public IConnection Data(IDictionary data) + { + if (data == null) + { + throw new ArgumentNullException("data"); + } + + foreach (KeyValuePair entry in data) + { + req.Data(KeyVal.Create(entry.Key, entry.Value)); + } + + return this; + } + + public IConnection Data(params string[] keyvals) + { + if (keyvals == null) + { + throw new ArgumentNullException("keyvals"); + } + + if ((keyvals.Length % 2) != 0) + { + throw new InvalidOperationException("Must supply an even number of key value pairs"); + } + + for (int i = 0; i < keyvals.Length; i += 2) + { + var key = keyvals[i]; + var value = keyvals[i + 1]; + + if (string.IsNullOrWhiteSpace(key)) + { + throw new ArgumentException("Data key must not be empty"); + } + + if (value == null) + { + throw new ArgumentException("Data value must not be null"); + } + + req.Data(KeyVal.Create(key, value)); + } + + return this; + } + + public IConnection Header(string name, string value) + { + req.Header(name, value); + return this; + } + + public IConnection Cookie(string name, string value) + { + req.Cookie(name, value); + return this; + } + + public IConnection Cookies(IDictionary cookies) + { + if (cookies == null) + { + throw new ArgumentNullException("cookies"); + } + foreach (var 
entry in cookies) + { + req.Cookie(entry.Key, entry.Value); + } + + return this; + } + + public IConnection Parser(Parser parser) + { + req.Parser(parser); + return this; + } + + public Document Get() + { + req.Method(NSoup.Method.Get); + Execute(); + return res.Parse(); + } + + public Document Post() + { + req.Method(NSoup.Method.Post); + Execute(); + return res.Parse(); + } + + public IResponse Execute() + { + res = Helper.Response.Execute(req); + return res; + } + + public IRequest Request() + { + return req; + } + + public IConnection Request(IRequest request) + { + req = request; + return this; + } + + public IResponse Response() + { + return res; + } + + public IConnection Response(IResponse response) + { + res = response; + return this; + } + + #endregion + } + + public abstract class ConnectionBase : IConnectionBase where T : IConnectionBase + { + protected Uri url; + protected Method method; + protected IDictionary headers; + protected IDictionary cookies; + + private class CaseInsensitiveComparer : IComparer + { + #region IComparer Members + + public int Compare(string x, string y) + { + return string.Compare(x, y, true); + } + + #endregion + } + + protected ConnectionBase() + { + headers = new SortedDictionary(new CaseInsensitiveComparer()); + cookies = new SortedDictionary(); + } + + public Uri Url() + { + return url; + } + + public IConnectionBase Url(Uri url) + { + if (url == null) + { + throw new ArgumentNullException("url"); + } + + this.url = url; + return this; + } + + public Method Method() + { + return method; + } + + public IConnectionBase Method(Method method) + { + this.method = method; + return this; + } + + public string Header(string name) + { + if (name == null) + { + throw new ArgumentNullException("name"); + } + + return GetHeaderCaseInsensitive(name); + } + + public IConnectionBase Header(string name, string value) + { + if (string.IsNullOrEmpty(name)) + { + throw new ArgumentException("Header name must not be empty", "name"); + } + + 
if (value == null) + { + throw new ArgumentNullException("value"); + } + + RemoveHeader(name); // ensures we don't get an "accept-encoding" and a "Accept-Encoding" + + headers[name] = value; + + return this; + } + + public bool HasHeader(string name) + { + if (string.IsNullOrEmpty(name)) + { + throw new ArgumentException("Header name must not be empty", "name"); + } + + return GetHeaderCaseInsensitive(name) != null; + } + + public bool HasHeaderWithValue(String name, String value) + { + return HasHeader(name) && Header(name).Equals(value, StringComparison.InvariantCultureIgnoreCase); + } + + public IConnectionBase RemoveHeader(string name) + { + if (string.IsNullOrEmpty(name)) + { + throw new ArgumentException("Header name must not be empty", "name"); + } + + KeyValuePair? entry = ScanHeaders(name); // remove is case insensitive too + if (entry != null) + { + headers.Remove(entry.Value.Key); // ensures correct case + } + + return this; + } + + public IDictionary Headers() + { + return headers; + } + + private string GetHeaderCaseInsensitive(string name) + { + if (name == null) + { + throw new ArgumentNullException("name", "Header name must not be null"); + } + + // quick evals for common case of title case, lower case, then scan for mixed + string value = null; + + if (!headers.TryGetValue(name, out value)) // Also case insensitive thanks to the CaseInsensitiveComparer. + { + KeyValuePair? entry = ScanHeaders(name); + if (entry != null) + { + value = entry.Value.Value; + } + } + + return value; + } + + private KeyValuePair? 
ScanHeaders(string name) + { + var lc = name.ToLowerInvariant(); + foreach (var entry in headers) + { + if (entry.Key.ToLowerInvariant().Equals(lc)) + { + return entry; + } + } + return null; + } + + public string Cookie(string name) + { + if (name == null) + { + throw new ArgumentNullException("name"); + } + + return cookies[name]; + } + + public IConnectionBase Cookie(string name, string value) + { + if (string.IsNullOrEmpty(name)) + { + throw new ArgumentException("Cookie name must not be empty", "name"); + } + + if (value == null) + { + throw new ArgumentNullException("value"); + } + + cookies[name] = value; + + return this; + } + + public bool HasCookie(string name) + { + if (string.IsNullOrEmpty(name)) + { + throw new ArgumentException("Cookie name must not be empty", "name"); + } + + return cookies.ContainsKey(name); + } + + public IConnectionBase RemoveCookie(string name) + { + if (string.IsNullOrEmpty(name)) + { + throw new ArgumentException("Cookie name must not be empty", "name"); + } + + cookies.Remove(name); + + return this; + } + + public IDictionary Cookies() + { + return cookies; + } + } + + public class Response : ConnectionBase, IResponse + { + private static readonly int MAX_REDIRECTS = 20; + private HttpStatusCode statusCode; + private string statusMessage; + private byte[] byteData; + private string charset; + private string contentType; + private bool executed = false; + private int numRedirects = 0; + private IRequest req; + + public Response() + : base() + { } + + private Response(IResponse previousResponse) + : base() + { + if (previousResponse != null) + { + numRedirects = previousResponse.NumRedirects + 1; + if (numRedirects >= MAX_REDIRECTS) + { + throw new IOException(string.Format("Too many redirects occurred trying to load URL {0}", previousResponse.Url())); + } + } + } + + public static Response Execute(IRequest req) + { + return Execute(req, null); + } + + public static Response Execute(IRequest req, IResponse previousResponse) + { 
+ if (req == null) + { + throw new ArgumentNullException("req", "Request must not be null"); + } + + string protocol = req.Url().Scheme; + + if (!protocol.Equals("http") && !protocol.Equals("https")) + { + // throw new MalformedURLException + throw new InvalidOperationException("Only http & https protocols supported"); + } + + // set up the request for execution + if (req.Method() == NSoup.Method.Get && req.Data().Count > 0) + { + SerialiseRequestUrl(req); // appends query string + } + + var conn = CreateConnection(req); + HttpWebResponse response = null; + + try + { + if (req.Method() == NSoup.Method.Post) + { + conn.ContentType = "application/x-www-form-urlencoded"; + WritePost(req.Data(), conn.GetRequestStream()); + } + + response = (HttpWebResponse)conn.GetResponse(); + } + catch (WebException e) + { + response = e.Response as HttpWebResponse; + + if (response != null) + { + return ProcessResponse(response, req, previousResponse); + } + } + + return ProcessResponse(response, req, previousResponse); + } + + private static Response ProcessResponse(HttpWebResponse response, IRequest req, IResponse previousResponse) + { + var needsRedirect = false; + var status = response.StatusCode; + + if (status != HttpStatusCode.OK) + { + if (status == HttpStatusCode.Found || status == HttpStatusCode.MovedPermanently || status == HttpStatusCode.SeeOther) + { + // In .NET (4.0+ ?), Moved and MovedPermanently have the same value. + needsRedirect = true; + } + else if (!req.IgnoreHttpErrors()) + { + throw new HttpStatusException("HTTP error fetching URL", (int)status, req.Url().ToString()); + } + } + + var res = new Response(previousResponse); + res.SetupFromConnection(response, previousResponse); + + if (needsRedirect && req.FollowRedirects()) + { + req.Method(NSoup.Method.Get); + req.Data().Clear(); + req.Url(new Uri(req.Url(), res.Header("Location"))); + + foreach (var cookie in res.Cookies()) // add response cookies to request (for e.g. 
login posts) + { + req.Cookie(cookie.Key, cookie.Value); + } + + return Execute(req, res); + } + + res.req = req; + + // check that we can handle the returned content type; if not, abort before fetching it + var contentType = res.ContentType(); + + if (contentType != null && !req.IgnoreContentType() && (!(contentType.StartsWith("text/") || contentType.StartsWith("application/xml") || contentType.StartsWith("application/xhtml+xml")))) + { + throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml", + contentType, req.Url().ToString()); + } + + //dataStream = conn.getErrorStream() != null ? conn.getErrorStream() : conn.getInputStream(); + //bodyStream = res.hasHeader("Content-Encoding") && res.header("Content-Encoding").equalsIgnoreCase("gzip") ? + // new BufferedInputStream(new GZIPInputStream(dataStream)) : + // new BufferedInputStream(dataStream); + + using (var inStream = + (res.HasHeader("Content-Encoding") && res.Header("Content-Encoding").Equals("gzip")) ? 
+ new GZipStream(response.GetResponseStream(), CompressionMode.Decompress) : + response.GetResponseStream()) + { + res.byteData = DataUtil.ReadToByteBuffer(inStream); + res.charset = DataUtil.GetCharsetFromContentType(res.ContentType()); // may be null, readInputStream deals with it + } + + res.executed = true; + + return res; + } + + public HttpStatusCode StatusCode() + { + return statusCode; + } + + public string StatusMessage() + { + return statusMessage; + } + + public string Charset() + { + return charset; + } + + public string ContentType() + { + return contentType; + } + + public Document Parse() + { + if (!executed) + { + throw new InvalidOperationException("Request must be executed (with .Execute(), .Get(), or .Post() before parsing response "); + } + + var doc = DataUtil.ParseByteData(byteData, charset, url.ToString(), req.Parser()); + + charset = doc.OutputSettings().Encoding.WebName.ToUpperInvariant(); // update charset from meta-equiv, possibly + return doc; + } + + public string Body() + { + if (!executed) + { + throw new InvalidOperationException("Request must be executed (with .Execute(), .Get(), or .Post() before getting response body"); + } + + // charset gets set from header on execute, and from meta-equiv on parse. parse may not have happened yet + return string.IsNullOrWhiteSpace(charset) ? 
DataUtil.DefaultEncoding.GetString(byteData) : + Encoding.GetEncoding(charset).GetString(byteData); + } + + public byte[] BodyAsBytes() + { + if (!executed) + { + throw new InvalidOperationException("Request must be executed (with .Execute(), .Get(), or .Post() before getting response body"); + } + return byteData; + } + + // set up connection defaults, and details from request + private static HttpWebRequest CreateConnection(IRequest req) + { + var conn = (HttpWebRequest)HttpWebRequest.Create(req.Url()); + conn.Method = req.Method().ToString(); + conn.AllowAutoRedirect = false; // don't rely on native redirection support + conn.Timeout = req.Timeout(); + conn.ReadWriteTimeout = req.Timeout(); + + if (req.Cookies().Count > 0) + { + conn.Headers.Add(HttpRequestHeader.Cookie, GetRequestCookieString(req)); + } + + // Added due to incosistent behavior by .NET when trying to add this header. + if (req.HasHeader("Referer")) + { + conn.Referer = req.Header("Referer"); + req.RemoveHeader("Referer"); + } + + // Same as above. 
+ if (req.HasHeader("User-Agent")) + { + conn.UserAgent = req.Header("User-Agent"); + req.RemoveHeader("User-Agent"); + } + + foreach (KeyValuePair header in req.Headers()) + { + conn.Headers.Add(header.Key, header.Value); + } + + return conn; + } + + + + // set up url, method, header, cookies + private void SetupFromConnection(HttpWebResponse conn, IResponse previousResponse) + { + + method = (Method)Enum.Parse(typeof(Method), conn.Method, true); + + url = conn.ResponseUri; + statusCode = conn.StatusCode; + statusMessage = conn.StatusDescription; + contentType = conn.ContentType; + + // headers into map + var resHeaders = conn.Headers; + + ProcessResponseHeaders(resHeaders); + + // if from a redirect, map previous response cookies into this response + if (previousResponse != null) + { + foreach (var prevCookie in previousResponse.Cookies()) + { + if (!HasCookie(prevCookie.Key)) + { + Cookie(prevCookie.Key, prevCookie.Value); + } + } + } + } + + public void ProcessResponseHeaders(WebHeaderCollection resHeaders) + { + foreach (string name in resHeaders.Keys) + { + if (string.IsNullOrWhiteSpace(name)) + { + continue; // http/1.1 line + } + + var value = resHeaders[name]; //.Split(';'); + + if (name.Equals("Set-Cookie", StringComparison.InvariantCultureIgnoreCase)) + { + var values = resHeaders["Set-Cookie"].Split(';', ','); + foreach (string v in values) + { + if (string.IsNullOrWhiteSpace(v)) + { + continue; + } + + var cd = new TokenQueue(v); + var cookieName = cd.ChompTo("=").Trim(); + var cookieVal = cd.ConsumeTo(";").Trim(); + + if (cookieVal == null) + { + cookieVal = string.Empty; + } + + // ignores path, date, domain, secure et al. req'd? + if (StringUtil.In(cookieName.ToLowerInvariant(), "domain", "path", "expires", "max-age", "secure", "httponly")) + { + // This is added for NSoup, since we do headers a bit differently around here. 
+ continue; + } + + // name not blank, value not null + if (!string.IsNullOrEmpty(cookieName)) + { + Cookie(cookieName, cookieVal); + } + } + } + else + { + if (!string.IsNullOrEmpty(value)) + { + Header(name, /*values[0]*/ value); + } + } + } + } + + private static void WritePost(ICollection data, Stream outputStream) + { + var sb = new StringBuilder(); + + var first = true; + foreach (var keyVal in data) + { + if (!first) + { + sb.Append('&'); + } + else + { + first = false; + } + + sb.Append(HttpUtility.UrlEncode(keyVal.Key(), DataUtil.DefaultEncoding)) + .Append('=') + .Append(HttpUtility.UrlEncode(keyVal.Value(), DataUtil.DefaultEncoding)); + } + + var bytes = DataUtil.DefaultEncoding.GetBytes(sb.ToString()); + + outputStream.Write(bytes, 0, bytes.Length); + outputStream.Close(); + } + + private static string GetRequestCookieString(IRequest req) + { + var sb = new StringBuilder(); + var first = true; + foreach (var cookie in req.Cookies()) + { + if (!first) + { + sb.Append("; "); + } + else + { + first = false; + } + sb.Append(cookie.Key).Append('=').Append(cookie.Value); + // todo: spec says only ascii, no escaping / encoding defined. validate on set? or escape somehow here? 
+ } + return sb.ToString(); + } + + // for get url reqs, serialise the data map into the url + private static void SerialiseRequestUrl(IRequest req) + { + var input = req.Url(); + var url = new StringBuilder(); + var first = true; + // reconstitute the query, ready for appends + url.Append(input.Scheme).Append("://").Append(input.Authority) + .Append(input.AbsolutePath).Append("?"); + + if (!string.IsNullOrEmpty(input.Query)) + { + url.Append(input.Query); + first = false; + } + + foreach (var keyVal in req.Data()) + { + if (!first) + { + url.Append('&'); + } + else + { + first = false; + } + + url.Append(HttpUtility.UrlEncode(keyVal.Key(), DataUtil.DefaultEncoding)) + .Append('=').Append(HttpUtility.UrlEncode(keyVal.Value(), DataUtil.DefaultEncoding)); + } + + req.Url(new Uri(url.ToString())); + req.Data().Clear(); // moved into url as get params + } + + public int NumRedirects + { + get + { + return numRedirects; + } + } + } + + public class Request : ConnectionBase, IRequest + { + private int timeoutMilliseconds; + private int maxBodySizeBytes; + private bool followRedirects; + private ICollection data; + private bool ignoreHttpErrors = false; + private bool ignoreContentType = false; + private Parser parser; + private bool parserDefined = false; // called parser(...) 
vs initialized in ctor + private bool validateTSLCertificates = true; + private string postDataCharset = DataUtil.DefaultEncoding.ToString(); + + public Request() + { + timeoutMilliseconds = 3000; + maxBodySizeBytes = 1024 * 1024; + followRedirects = true; + data = new List(); + method = NSoup.Method.Get; + headers["Accept-Encoding"] = "gzip"; + parser = Parse.Parser.HtmlParser(); + } + + public int Timeout() + { + return timeoutMilliseconds; + } + + public IRequest Timeout(int millis) + { + if (millis < 0) + { + throw new ArgumentOutOfRangeException("Timeout milliseconds must be 0 (infinite) or greater"); + } + + timeoutMilliseconds = millis; + + return this; + } + + public int MaxBodySize() + { + return maxBodySizeBytes; + } + + public IRequest MaxBodySize(int bytes) + { + if (bytes < 0) + { + throw new ArgumentOutOfRangeException("Max Size must be 0 (infinite) or greater"); + } + + maxBodySizeBytes = bytes; + return this; + } + + public bool FollowRedirects() + { + return followRedirects; + } + + public IRequest FollowRedirects(bool followRedirects) + { + this.followRedirects = followRedirects; + return this; + } + + public bool IgnoreHttpErrors() + { + return ignoreHttpErrors; + } + + public IRequest IgnoreHttpErrors(bool ignoreHttpErrors) + { + this.ignoreHttpErrors = ignoreHttpErrors; + return this; + } + + public bool ValidateTLSCertificates() + { + return validateTSLCertificates; + } + + public void ValidateTLSCertificates(bool value) + { + validateTSLCertificates = value; + } + + public bool IgnoreContentType() + { + return ignoreContentType; + } + + public IRequest IgnoreContentType(bool ignoreContentType) + { + this.ignoreContentType = ignoreContentType; + return this; + } + + public IRequest Data(KeyVal keyval) + { + if (keyval == null) + { + throw new ArgumentNullException("keyval"); + } + + data.Add(keyval); + + return this; + } + + public ICollection Data() + { + return data; + } + + public IRequest Parser(Parser parser) + { + this.parser = parser; + 
return this; + } + + public Parser Parser() + { + return parser; + } + } + + public class KeyVal : IKeyVal + { + private string key; + private string value; + private Stream stream; + + public static KeyVal Create(string key, string value) + { + if (string.IsNullOrEmpty(key)) + { + throw new ArgumentException("Data key must not be empty", "key"); + } + + if (value == null) + { + throw new ArgumentNullException("value", "Data value must not be null"); + } + + return new KeyVal(key, value); + } + + public static KeyVal Create(string key, string value, Stream stream) + { + if (string.IsNullOrEmpty(key)) + { + throw new ArgumentException("Data key must not be empty", "key"); + } + + if (value == null) + { + throw new ArgumentNullException("value", "Data value must not be null"); + } + + return new KeyVal(key, value, stream); + } + + private KeyVal(string key, string value) + { + this.key = key; + this.value = value; + } + + private KeyVal(string key, string value, Stream stream) + { + this.key = key; + this.value = value; + this.stream = stream; + } + + #region IKeyVal Members + + public IKeyVal Key(string key) + { + if (string.IsNullOrEmpty(key)) + { + throw new ArgumentException("Data key must not be empty", "key"); + } + + this.key = key; + + return this; + } + + public string Key() + { + return key; + } + + public IKeyVal Value(string value) + { + if (value == null) + { + throw new ArgumentNullException("value", "Data value must not be null"); + } + + this.value = value; + + return this; + } + + public string Value() + { + return value; + } + + public IKeyVal InputStream(Stream inputStream) + { + if (inputStream == null) + { + throw new ArgumentNullException("inputStream", "Data input stream must not be null"); + } + + this.stream = inputStream; + return this; + } + + public Stream InputStream() + { + return stream; + } + + public bool HasInputStream() + { + return stream != null; + } + + #endregion + + public override string ToString() + { + return 
string.Concat(key, "=", value); + } + } } \ No newline at end of file diff --git a/NSoup/Helper/LinkedHashSet.cs b/NSoup/Helper/LinkedHashSet.cs index 06bdab6..d07ab25 100644 --- a/NSoup/Helper/LinkedHashSet.cs +++ b/NSoup/Helper/LinkedHashSet.cs @@ -1,8 +1,7 @@ using System; +using System.Collections; using System.Collections.Generic; using System.Linq; -using System.Text; -using System.Collections; namespace NSoup { @@ -148,7 +147,7 @@ public bool Contains(T item) return _hashSet.Contains(item); } - + /// /// Copies the elements of a NSoup.LinkedHashSet object to /// an array, starting at the specified array index. diff --git a/NSoup/Helper/StringUtil.cs b/NSoup/Helper/StringUtil.cs index e492a1f..e392b2f 100644 --- a/NSoup/Helper/StringUtil.cs +++ b/NSoup/Helper/StringUtil.cs @@ -5,163 +5,163 @@ namespace NSoup.Helper { - /** + /** * A minimal String utility class. Designed for internal jsoup use only. */ - public static class StringUtil - { - // memoised padding up to 10 - private static readonly string[] padding = { "", " ", " ", " ", " ", " ", " ", " ", " ", " ", " " }; - - /// - /// Join a collection of strings by a seperator - /// - /// collection of string objects - /// string to place between strings - /// joined string - public static string Join(this ICollection strings, string sep) - { - return string.Join(sep, strings.ToArray()); - } - - public static string Join(IEnumerator iterator, string sep) - { - if (!iterator.MoveNext()) - { - return string.Empty; - } - - var start = iterator.Current; - if (!iterator.MoveNext()) - { - return start; - } - - var sb = new StringBuilder(64).Append(start); - while (iterator.MoveNext()) - { - sb.Append(sep); - sb.Append(iterator.Current); - } - - return sb.ToString(); - } - - /// - /// Returns space padding - /// - /// amount of padding desired - /// string of spaces * width - public static string Padding(int width) - { - if (width < 0) - { - throw new ArgumentException("width must be > 0"); - } - - if (width < 
padding.Length) - { - return padding[width]; - } - - return string.Empty.PadLeft(width); - } - - public static bool IsBlank(this string s) - { - return string.IsNullOrWhiteSpace(s) ? true : s.Trim().Length == 0; - } - - public static bool IsNumeric(this string s) - { - if (string.IsNullOrEmpty(s)) - { - return false; - } - - var anyNonDigits = s.ToCharArray().Any(c => !char.IsDigit(c)); - return !(anyNonDigits); - } - - /// - /// Tests if a code point is "whitespace" as defined in the HTML spec. - /// - /// Code point to test - /// True if code point is whitespace, false otherwise - public static bool IsWhiteSpace(char c) - { - return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'; - } - - public static string NormaliseWhitespace(this string s) - { - var sb = new StringBuilder(s.Length); - - var lastWasWhite = false; - var reachedNonWhite = false; - - var l = s.Length; - for (var i = 0; i < l; i++) - { - var c = s[i]; - if (IsWhiteSpace(c)) - { - if (lastWasWhite) { continue; } - sb.Append(' '); - lastWasWhite = true; - } - else - { - sb.Append(c); - lastWasWhite = false; - reachedNonWhite = true; - } - } - - return sb.ToString(); - } - - public static bool In(string needle, params string[] haystack) - { - foreach (string hay in haystack) - { - if (hay.Equals(needle)) - { - return true; - } - } - return false; - } - - public static bool InSorted(string needle, params string[] haystack) - { - return Array.BinarySearch(haystack, needle) >= 0; - } - - public static Uri Resolve(Uri url, string relUrl) - { - Uri resultUri = null; - if (relUrl.IndexOf('.') == 0 && url.PathAndQuery.IndexOf('/') != 0) - { - url = new Uri(url.Scheme + url.Host + url.Port + "/" + url.PathAndQuery); - } - - Uri.TryCreate(url, relUrl, out resultUri); - return resultUri; - } - - public static string Resolve(string url, string relUrl) - { - Uri baseUri = null; - var validUri = Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out baseUri); - if (validUri) - { - var resultUri = 
Resolve(baseUri, relUrl); - return resultUri == null ? string.Empty : resultUri.ToString(); - } - - validUri = Uri.TryCreate(relUrl, UriKind.RelativeOrAbsolute, out baseUri); - return baseUri == null ? string.Empty : baseUri.ToString(); - } - } + public static class StringUtil + { + // memoised padding up to 10 + private static readonly string[] padding = { "", " ", " ", " ", " ", " ", " ", " ", " ", " ", " " }; + + /// + /// Join a collection of strings by a seperator + /// + /// collection of string objects + /// string to place between strings + /// joined string + public static string Join(this ICollection strings, string sep) + { + return string.Join(sep, strings.ToArray()); + } + + public static string Join(IEnumerator iterator, string sep) + { + if (!iterator.MoveNext()) + { + return string.Empty; + } + + var start = iterator.Current; + if (!iterator.MoveNext()) + { + return start; + } + + var sb = new StringBuilder(64).Append(start); + while (iterator.MoveNext()) + { + sb.Append(sep); + sb.Append(iterator.Current); + } + + return sb.ToString(); + } + + /// + /// Returns space padding + /// + /// amount of padding desired + /// string of spaces * width + public static string Padding(int width) + { + if (width < 0) + { + throw new ArgumentException("width must be > 0"); + } + + if (width < padding.Length) + { + return padding[width]; + } + + return string.Empty.PadLeft(width); + } + + public static bool IsBlank(this string s) + { + return string.IsNullOrWhiteSpace(s) ? true : s.Trim().Length == 0; + } + + public static bool IsNumeric(this string s) + { + if (string.IsNullOrEmpty(s)) + { + return false; + } + + var anyNonDigits = s.ToCharArray().Any(c => !char.IsDigit(c)); + return !(anyNonDigits); + } + + /// + /// Tests if a code point is "whitespace" as defined in the HTML spec. 
+ /// + /// Code point to test + /// True if code point is whitespace, false otherwise + public static bool IsWhiteSpace(char c) + { + return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'; + } + + public static string NormaliseWhitespace(this string s) + { + var sb = new StringBuilder(s.Length); + + var lastWasWhite = false; + var reachedNonWhite = false; + + var l = s.Length; + for (var i = 0; i < l; i++) + { + var c = s[i]; + if (IsWhiteSpace(c)) + { + if (lastWasWhite) { continue; } + sb.Append(' '); + lastWasWhite = true; + } + else + { + sb.Append(c); + lastWasWhite = false; + reachedNonWhite = true; + } + } + + return sb.ToString(); + } + + public static bool In(string needle, params string[] haystack) + { + foreach (string hay in haystack) + { + if (hay.Equals(needle)) + { + return true; + } + } + return false; + } + + public static bool InSorted(string needle, params string[] haystack) + { + return Array.BinarySearch(haystack, needle) >= 0; + } + + public static Uri Resolve(Uri url, string relUrl) + { + Uri resultUri = null; + if (relUrl.IndexOf('.') == 0 && url.PathAndQuery.IndexOf('/') != 0) + { + url = new Uri(url.Scheme + url.Host + url.Port + "/" + url.PathAndQuery); + } + + Uri.TryCreate(url, relUrl, out resultUri); + return resultUri; + } + + public static string Resolve(string url, string relUrl) + { + Uri baseUri = null; + var validUri = Uri.TryCreate(url, UriKind.RelativeOrAbsolute, out baseUri); + if (validUri) + { + var resultUri = Resolve(baseUri, relUrl); + return resultUri == null ? string.Empty : resultUri.ToString(); + } + + validUri = Uri.TryCreate(relUrl, UriKind.RelativeOrAbsolute, out baseUri); + return baseUri == null ? 
string.Empty : baseUri.ToString(); + } + } } \ No newline at end of file diff --git a/NSoup/HttpStatusException.cs b/NSoup/HttpStatusException.cs index 5112519..71abfe1 100644 --- a/NSoup/HttpStatusException.cs +++ b/NSoup/HttpStatusException.cs @@ -1,8 +1,4 @@ -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; +using System.IO; namespace NSoup { @@ -36,5 +32,5 @@ public override string ToString() return base.ToString() + ". Status=" + StatusCode + ", URL=" + Url; } } -} - +} + diff --git a/NSoup/IConnection.cs b/NSoup/IConnection.cs index 7c6e6cb..f5d5b62 100644 --- a/NSoup/IConnection.cs +++ b/NSoup/IConnection.cs @@ -1,521 +1,519 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; +using NSoup.Helper; using NSoup.Nodes; -using System.Net; -using NSoup.Helper; using NSoup.Parse; +using System; +using System.Collections.Generic; using System.IO; +using System.Net; namespace NSoup { - /// - /// GET and POST http methods. - /// - public enum Method - { - Get, Post - } - - public interface IConnection - { - /// - /// Set the request URL to fetch. The protocol must be HTTP or HTTPS. - /// - /// URL to connect to - /// this IConnection, for chaining - IConnection Url(Uri url); - - /// - /// Set the request URL to fetch. The protocol must be HTTP or HTTPS. - /// - /// URL to connect to - /// this IConnection, for chaining - IConnection Url(string url); - - /// - /// Set the request user-agent header. - /// - /// user-agent to use - /// this IConnection, for chaining - IConnection UserAgent(string userAgent); - - /// - /// Set the request timeouts (connect and read). If a timeout occurs, an IOException will be thrown. The default - /// timeout is 3 seconds (3000 millis). A timeout of zero is treated as an infinite timeout. - /// - /// number of milliseconds (thousandths of a second) before timing out connects or reads. 
- /// this IConnection, for chaining - IConnection Timeout(int millis); - - /// - /// Set the request referrer (aka "referer") header. - /// - /// referrer to use - /// this IConnection, for chaining - IConnection Referrer(string referrer); - - /// - /// Configures the connection to (not) follow server redirects. By default this is true. - /// - /// true if server redirects should be followed. - /// this IConnection, for chaining - IConnection FollowRedirects(bool followRedirects); - - /// - /// Set the request method to use, GET or POST. Default is GET. - /// - /// HTTP request method - /// this IConnection, for chaining - IConnection Method(Method method); - - /// - /// Configures the connection to not throw exceptions when a HTTP error occurs. (4xx - 5xx, e.g. 404 or 500). By - /// default this is false; an IOException is thrown if an error is encountered. If set to true, the - /// response is populated with the error body, and the status message will reflect the error. - /// - /// false (default) if HTTP errors should be ignored - /// this IConnection, for chaining - IConnection IgnoreHttpErrors(bool ignoreHttpErrors); - - /// - /// Ignore the document's Content-Type when parsing the response. By default this is false, an unrecognised - /// content-type will cause an IOException to be thrown. (This is to prevent producing garbage by attempting to parse - /// a JPEG binary image, for example.) Set to true to force a parse attempt regardless of content type. - /// - /// set to true if you would like the content type ignored on parsing the response into a Document - /// this IConnection, for chaining - IConnection IgnoreContentType(bool ignoreContentType); - - /// - /// Add a request data parameter. Request parameters are sent in the request query string for GETs, and in the request - /// body for POSTs. A request may have multiple values of the same name. 
- /// - /// data key - /// data value - /// this IConnection, for chaining - IConnection Data(string key, string value); - - /// - /// Adds all of the supplied data to the request data parameters - /// - /// dictionary of data parameters - /// this IConnection, for chaining - IConnection Data(IDictionary data); - - /// - /// Add a number of request data parameters. Multiple parameters may be set at once, e.g.: - /// .data("name", "jsoup", "language", "Java", "language", "English"); creates a query string like: - /// ?name=jsoup&language=Java&language=English - /// - /// a set of key value pairs. - /// this IConnection, for chaining - IConnection Data(params string[] keyvals); - - /// - /// Set a request header. - /// - /// header name - /// header value - /// this IConnection, for chaining - /// - IConnection Header(string name, string value); - - /// - /// Set a cookie to be sent in the request. - /// - /// name of cookie - /// value of cookie - /// this IConnection, for chaining - IConnection Cookie(string name, string value); - - /// - /// Adds each of the supplied cookies to the request. - /// - /// Map of cookie name -> value pairs - /// This Connection, for chaining - IConnection Cookies(IDictionary cookies); - - /// - /// Provide an alternate parser to use when parsing the response to a Document. - /// - /// Alternate parser - /// This Connection, for chaining - IConnection Parser(Parser parser); - - /// - /// Execute the request as a GET, and parse the result. - /// - /// Parsed Document - /// If the response is not OK and HTTP response errors are not ignored - /// If the response mime type is not supported and those errors are not ignored - /// //@throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed - /// //@throws java.net.SocketTimeoutException if the connection times out - /// On error - Document Get(); - - /// - /// Execute the request as a POST, and parse the result. 
- /// - /// Parsed Document - /// If the response is not OK and HTTP response errors are not ignored - /// If the response mime type is not supported and those errors are not ignored - /// //@throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed - /// //@throws java.net.SocketTimeoutException if the connection times out - /// On error - Document Post(); - - /// - /// Execute the request. - /// - /// a response object - /// If the response is not OK and HTTP response errors are not ignored - /// If the response mime type is not supported and those errors are not ignored - /// //@throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed - /// //@throws java.net.SocketTimeoutException if the connection times out - /// On error - IResponse Execute(); - - /// - /// Get the request object associated with this IConnection - /// - /// request - IRequest Request(); - - /// - /// Set the IConnection's request - /// - /// new request object - /// this IConnection, for chaining - IConnection Request(IRequest request); - - /// - /// Get the response, once the request has been executed - /// - /// response - IResponse Response(); - - /// - /// Set the connection's response - /// - /// new response - /// this IConnection, for chaining - IConnection Response(IResponse response); - } - - /// - /// Common methods for Requests and Responses - /// - /// Type of IConnectionBase, either Request or Response - public interface IConnectionBase where T : IConnectionBase - { - - /// - /// Gets the URL - /// - /// URL - Uri Url(); - - /// - /// Sets the URL - /// - /// new URL - /// this, for chaining - IConnectionBase Url(Uri url); - - /// - /// Gets the request method - /// - /// method - Method Method(); - - /// - /// Sets the request method - /// - /// new method - /// this, for chaining - IConnectionBase Method(Method method); - - /// - /// Gets the value of a header. 
This is a simplified header model, where a header may only have one value. - /// Header names are case insensitive. - /// - /// name of header (case insensitive) - /// value of header, or null if not set. - /// - /// - string Header(string name); - - /// - /// Sets a header. This method will overwrite any existing header with the same case insensitive name. - /// - /// Name of header - /// Value of header - /// this, for chaining - IConnectionBase Header(string name, string value); - - /// - /// Check if a header is present - /// - /// name of header (case insensitive) - /// if the header is present in this request/response - bool HasHeader(string name); - - /// - /// Remove a header by name - /// - /// name of header to remove (case insensitive) - /// this, for chaining - IConnectionBase RemoveHeader(string name); - - /// - /// Retrieve all of the request/response headers as a map - /// - /// headers - IDictionary Headers(); - - /// - /// Gets a cookie value by name from this request/response. - /// Response objects have a simplified cookie model. Each cookie set in the response is added to the response - /// object's cookie key=value map. The cookie's path, domain, and expiry date are ignored. - /// - /// name of cookie to retrieve. - /// value of cookie, or null if not set - string Cookie(string name); - - /// - /// Sets a cookie in this request/response. - /// - /// name of cookie - /// value of cookie - /// this, for chaining - IConnectionBase Cookie(string name, string value); - - /// - /// Check if a cookie is present - /// - /// name of cookie - /// if the cookie is present in this request/response - bool HasCookie(string name); - - /// - /// Remove a cookie by name - /// - /// name of cookie to remove - /// this, for chaining - IConnectionBase RemoveCookie(string name); - - /// - /// Retrieve all of the request/response cookies as a map - /// - /// cookies - IDictionary Cookies(); - - } - - /// - /// Represents a HTTP request. 
- /// - public interface IRequest : IConnectionBase - { - - /// - /// Gets the request timeout, in milliseconds. - /// - /// the timeout in milliseconds. - int Timeout(); - - /// - /// Update the request timeout. - /// - /// timeout, in milliseconds - /// this Request, for chaining - IRequest Timeout(int millis); - - /// - /// Update the maximum allowed body size - /// - /// bytes, as an integer - /// this Rquest for chaining - IRequest MaxBodySize(int bytes); - - /// - /// Get the current followRedirects configuration. - /// - /// true if followRedirects is enabled. - bool FollowRedirects(); - - /// - /// Configures the request to (not) follow server redirects. By default this is true. - /// - /// true if server redirects should be followed. - /// This IRequest, for chaining - IRequest FollowRedirects(bool followRedirects); - - /// - /// Gets the current IgnoreHttpErrors configuration. - /// - /// true if errors will be ignored; false (default) if HTTP errors will cause an IOException to be thrown - bool IgnoreHttpErrors(); - - /// - /// Configures the request to ignore HTTP errors in the response. - /// - /// set to true to ignore HTTP errors. - /// This IRequest, for chaining - IRequest IgnoreHttpErrors(bool ignoreHttpErrors); - - /// - /// Configure the request to validate TLS Certificates - /// - /// set to true to validate tls certs - /// this IRequest for chaining - void ValidateTLSCertificates(bool value); - /// - /// Gets the current IgnoreContentType configuration. - /// - /// true if invalid content-types will be ignored; false (default) if they will cause an IOException to be thrown - bool IgnoreContentType(); - - /// - /// Configures the request to ignore the Content-Type of the response. - /// - /// set to true to ignore the contenet type - /// This IRequest, for chaining - IRequest IgnoreContentType(bool ignoreContentType); - - /// - /// Add a data parameter to the request - /// - /// data to add. 
- /// this Request, for chaining - IRequest Data(KeyVal keyval); - - /// - /// Get all of the request's data parameters - /// - /// collection of keyvals - ICollection Data(); - - /// - /// Specify the parser to use when parsing the document. - /// - /// Parser to use. - /// This IRequest, for chaining - IRequest Parser(Parser parser); - - /// - /// Get the current parser to use when parsing the document. - /// - /// Current Parser - Parser Parser(); - } - - /// - /// Represents a HTTP response. - /// - public interface IResponse : IConnectionBase - { - - /// - /// Gets the status code of the response. - /// - /// status code - HttpStatusCode StatusCode(); - - /// - /// Gets the status message of the response. - /// - /// status message - string StatusMessage(); - - /// - /// Gets the character set name of the response. - /// - /// character set name - string Charset(); - - /// - /// Gets the response content type (e.g. "text/html"); - /// - /// the response content type - string ContentType(); - - /// - /// Parse the body of the response as a Document. - /// - /// a parsed Document - /// - Document Parse(); - - /// - /// Gets the body of the response as a plain string. - /// - /// body - string Body(); - - /// - /// Gets the body of the response as an array of bytes. - /// - /// body bytes - byte[] BodyAsBytes(); - - /// - /// Gets number of redirects. - /// - int NumRedirects { get; } - } - - /// - /// A Key Value tuple. 
- /// - public interface IKeyVal - { - - /// - /// Update the key of a keyval - /// - /// new key - /// this KeyVal, for chaining - IKeyVal Key(string key); - - /// - /// Gets the key of a keyval - /// - /// the key - string Key(); - - /// - /// Update the value of a keyval - /// - /// the new value - /// this KeyVal, for chaining - IKeyVal Value(string value); - - /// - /// Gets the value of a keyval - /// - /// the value - string Value(); - - /// - /// Update stream of a keyval - /// - /// - /// - IKeyVal InputStream(Stream inputStream); - - /// - /// Get the stream of a keyval - /// - /// - Stream InputStream(); - - /// - /// Determine if keyval has a stream - /// - /// - bool HasInputStream(); + /// + /// GET and POST http methods. + /// + public enum Method + { + Get, Post + } + + public interface IConnection + { + /// + /// Set the request URL to fetch. The protocol must be HTTP or HTTPS. + /// + /// URL to connect to + /// this IConnection, for chaining + IConnection Url(Uri url); + + /// + /// Set the request URL to fetch. The protocol must be HTTP or HTTPS. + /// + /// URL to connect to + /// this IConnection, for chaining + IConnection Url(string url); + + /// + /// Set the request user-agent header. + /// + /// user-agent to use + /// this IConnection, for chaining + IConnection UserAgent(string userAgent); + + /// + /// Set the request timeouts (connect and read). If a timeout occurs, an IOException will be thrown. The default + /// timeout is 3 seconds (3000 millis). A timeout of zero is treated as an infinite timeout. + /// + /// number of milliseconds (thousandths of a second) before timing out connects or reads. + /// this IConnection, for chaining + IConnection Timeout(int millis); + + /// + /// Set the request referrer (aka "referer") header. + /// + /// referrer to use + /// this IConnection, for chaining + IConnection Referrer(string referrer); + + /// + /// Configures the connection to (not) follow server redirects. By default this is true. 
+ /// + /// true if server redirects should be followed. + /// this IConnection, for chaining + IConnection FollowRedirects(bool followRedirects); + + /// + /// Set the request method to use, GET or POST. Default is GET. + /// + /// HTTP request method + /// this IConnection, for chaining + IConnection Method(Method method); + + /// + /// Configures the connection to not throw exceptions when a HTTP error occurs. (4xx - 5xx, e.g. 404 or 500). By + /// default this is false; an IOException is thrown if an error is encountered. If set to true, the + /// response is populated with the error body, and the status message will reflect the error. + /// + /// false (default) if HTTP errors should be ignored + /// this IConnection, for chaining + IConnection IgnoreHttpErrors(bool ignoreHttpErrors); + + /// + /// Ignore the document's Content-Type when parsing the response. By default this is false, an unrecognised + /// content-type will cause an IOException to be thrown. (This is to prevent producing garbage by attempting to parse + /// a JPEG binary image, for example.) Set to true to force a parse attempt regardless of content type. + /// + /// set to true if you would like the content type ignored on parsing the response into a Document + /// this IConnection, for chaining + IConnection IgnoreContentType(bool ignoreContentType); + + /// + /// Add a request data parameter. Request parameters are sent in the request query string for GETs, and in the request + /// body for POSTs. A request may have multiple values of the same name. + /// + /// data key + /// data value + /// this IConnection, for chaining + IConnection Data(string key, string value); + + /// + /// Adds all of the supplied data to the request data parameters + /// + /// dictionary of data parameters + /// this IConnection, for chaining + IConnection Data(IDictionary data); + + /// + /// Add a number of request data parameters. 
Multiple parameters may be set at once, e.g.: + /// .data("name", "jsoup", "language", "Java", "language", "English"); creates a query string like: + /// ?name=jsoup&language=Java&language=English + /// + /// a set of key value pairs. + /// this IConnection, for chaining + IConnection Data(params string[] keyvals); + + /// + /// Set a request header. + /// + /// header name + /// header value + /// this IConnection, for chaining + /// + IConnection Header(string name, string value); + + /// + /// Set a cookie to be sent in the request. + /// + /// name of cookie + /// value of cookie + /// this IConnection, for chaining + IConnection Cookie(string name, string value); + + /// + /// Adds each of the supplied cookies to the request. + /// + /// Map of cookie name -> value pairs + /// This Connection, for chaining + IConnection Cookies(IDictionary cookies); + + /// + /// Provide an alternate parser to use when parsing the response to a Document. + /// + /// Alternate parser + /// This Connection, for chaining + IConnection Parser(Parser parser); + + /// + /// Execute the request as a GET, and parse the result. + /// + /// Parsed Document + /// If the response is not OK and HTTP response errors are not ignored + /// If the response mime type is not supported and those errors are not ignored + /// //@throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed + /// //@throws java.net.SocketTimeoutException if the connection times out + /// On error + Document Get(); + + /// + /// Execute the request as a POST, and parse the result. 
+ /// + /// Parsed Document + /// If the response is not OK and HTTP response errors are not ignored + /// If the response mime type is not supported and those errors are not ignored + /// //@throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed + /// //@throws java.net.SocketTimeoutException if the connection times out + /// On error + Document Post(); + + /// + /// Execute the request. + /// + /// a response object + /// If the response is not OK and HTTP response errors are not ignored + /// If the response mime type is not supported and those errors are not ignored + /// //@throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed + /// //@throws java.net.SocketTimeoutException if the connection times out + /// On error + IResponse Execute(); + + /// + /// Get the request object associated with this IConnection + /// + /// request + IRequest Request(); + + /// + /// Set the IConnection's request + /// + /// new request object + /// this IConnection, for chaining + IConnection Request(IRequest request); + + /// + /// Get the response, once the request has been executed + /// + /// response + IResponse Response(); + + /// + /// Set the connection's response + /// + /// new response + /// this IConnection, for chaining + IConnection Response(IResponse response); + } + + /// + /// Common methods for Requests and Responses + /// + /// Type of IConnectionBase, either Request or Response + public interface IConnectionBase where T : IConnectionBase + { + + /// + /// Gets the URL + /// + /// URL + Uri Url(); + + /// + /// Sets the URL + /// + /// new URL + /// this, for chaining + IConnectionBase Url(Uri url); + + /// + /// Gets the request method + /// + /// method + Method Method(); + + /// + /// Sets the request method + /// + /// new method + /// this, for chaining + IConnectionBase Method(Method method); + + /// + /// Gets the value of a header. 
This is a simplified header model, where a header may only have one value. + /// Header names are case insensitive. + /// + /// name of header (case insensitive) + /// value of header, or null if not set. + /// + /// + string Header(string name); + + /// + /// Sets a header. This method will overwrite any existing header with the same case insensitive name. + /// + /// Name of header + /// Value of header + /// this, for chaining + IConnectionBase Header(string name, string value); + + /// + /// Check if a header is present + /// + /// name of header (case insensitive) + /// if the header is present in this request/response + bool HasHeader(string name); + + /// + /// Remove a header by name + /// + /// name of header to remove (case insensitive) + /// this, for chaining + IConnectionBase RemoveHeader(string name); + + /// + /// Retrieve all of the request/response headers as a map + /// + /// headers + IDictionary Headers(); + + /// + /// Gets a cookie value by name from this request/response. + /// Response objects have a simplified cookie model. Each cookie set in the response is added to the response + /// object's cookie key=value map. The cookie's path, domain, and expiry date are ignored. + /// + /// name of cookie to retrieve. + /// value of cookie, or null if not set + string Cookie(string name); + + /// + /// Sets a cookie in this request/response. + /// + /// name of cookie + /// value of cookie + /// this, for chaining + IConnectionBase Cookie(string name, string value); + + /// + /// Check if a cookie is present + /// + /// name of cookie + /// if the cookie is present in this request/response + bool HasCookie(string name); + + /// + /// Remove a cookie by name + /// + /// name of cookie to remove + /// this, for chaining + IConnectionBase RemoveCookie(string name); + + /// + /// Retrieve all of the request/response cookies as a map + /// + /// cookies + IDictionary Cookies(); + + } + + /// + /// Represents a HTTP request. 
+ /// + public interface IRequest : IConnectionBase + { + + /// + /// Gets the request timeout, in milliseconds. + /// + /// the timeout in milliseconds. + int Timeout(); + + /// + /// Update the request timeout. + /// + /// timeout, in milliseconds + /// this Request, for chaining + IRequest Timeout(int millis); + + /// + /// Update the maximum allowed body size + /// + /// bytes, as an integer + /// this Rquest for chaining + IRequest MaxBodySize(int bytes); + + /// + /// Get the current followRedirects configuration. + /// + /// true if followRedirects is enabled. + bool FollowRedirects(); + + /// + /// Configures the request to (not) follow server redirects. By default this is true. + /// + /// true if server redirects should be followed. + /// This IRequest, for chaining + IRequest FollowRedirects(bool followRedirects); + + /// + /// Gets the current IgnoreHttpErrors configuration. + /// + /// true if errors will be ignored; false (default) if HTTP errors will cause an IOException to be thrown + bool IgnoreHttpErrors(); + + /// + /// Configures the request to ignore HTTP errors in the response. + /// + /// set to true to ignore HTTP errors. + /// This IRequest, for chaining + IRequest IgnoreHttpErrors(bool ignoreHttpErrors); + + /// + /// Configure the request to validate TLS Certificates + /// + /// set to true to validate tls certs + /// this IRequest for chaining + void ValidateTLSCertificates(bool value); + /// + /// Gets the current IgnoreContentType configuration. + /// + /// true if invalid content-types will be ignored; false (default) if they will cause an IOException to be thrown + bool IgnoreContentType(); + + /// + /// Configures the request to ignore the Content-Type of the response. + /// + /// set to true to ignore the contenet type + /// This IRequest, for chaining + IRequest IgnoreContentType(bool ignoreContentType); + + /// + /// Add a data parameter to the request + /// + /// data to add. 
+ /// this Request, for chaining + IRequest Data(KeyVal keyval); + + /// + /// Get all of the request's data parameters + /// + /// collection of keyvals + ICollection Data(); + + /// + /// Specify the parser to use when parsing the document. + /// + /// Parser to use. + /// This IRequest, for chaining + IRequest Parser(Parser parser); + + /// + /// Get the current parser to use when parsing the document. + /// + /// Current Parser + Parser Parser(); + } + + /// + /// Represents an HTTP response. + /// + public interface IResponse : IConnectionBase + { + + /// + /// Gets the status code of the response. + /// + /// status code + HttpStatusCode StatusCode(); + + /// + /// Gets the status message of the response. + /// + /// status message + string StatusMessage(); + + /// + /// Gets the character set name of the response. + /// + /// character set name + string Charset(); + + /// + /// Gets the response content type (e.g. "text/html"). + /// + /// the response content type + string ContentType(); + + /// + /// Parse the body of the response as a Document. + /// + /// a parsed Document + /// + Document Parse(); + + /// + /// Gets the body of the response as a plain string. + /// + /// body + string Body(); + + /// + /// Gets the body of the response as an array of bytes. + /// + /// body bytes + byte[] BodyAsBytes(); + + /// + /// Gets number of redirects. + /// + int NumRedirects { get; } + } + + /// + /// A Key Value tuple. 
+ /// + public interface IKeyVal + { + + /// + /// Update the key of a keyval + /// + /// new key + /// this KeyVal, for chaining + IKeyVal Key(string key); + + /// + /// Gets the key of a keyval + /// + /// the key + string Key(); + + /// + /// Update the value of a keyval + /// + /// the new value + /// this KeyVal, for chaining + IKeyVal Value(string value); + + /// + /// Gets the value of a keyval + /// + /// the value + string Value(); + + /// + /// Update stream of a keyval + /// + /// + /// + IKeyVal InputStream(Stream inputStream); + + /// + /// Get the stream of a keyval + /// + /// + Stream InputStream(); + + /// + /// Determine if keyval has a stream + /// + /// + bool HasInputStream(); } } \ No newline at end of file diff --git a/NSoup/NSoup.cs b/NSoup/NSoup.cs index b311cfe..b551766 100644 --- a/NSoup/NSoup.cs +++ b/NSoup/NSoup.cs @@ -1,12 +1,9 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; +using NSoup.Helper; using NSoup.Nodes; using NSoup.Parse; -using System.IO; using NSoup.Safety; -using NSoup.Helper; +using System; +using System.IO; namespace NSoup { diff --git a/NSoup/NSoup.csproj b/NSoup/NSoup.csproj index 7dd17fa..4474604 100644 --- a/NSoup/NSoup.csproj +++ b/NSoup/NSoup.csproj @@ -1,120 +1,23 @@ - - + + - Debug - AnyCPU - 9.0.21022 - 2.0 - {EA189DC2-2C8D-4B50-BEE8-8964D6BEDF33} - Library - Properties - NSoup - NSoup - v4.6 - 512 - true - NSoup.snk - - - - - 3.5 - + netstandard2.0 + false + 2.0.0 + true - - true - full - false - bin\Debug\ - DEBUG;TRACE - prompt - 4 - false - - - pdbonly + + true - bin\Release\ - TRACE - prompt - 4 - false + - - - 3.5 - - - - 3.5 - - - 3.5 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - \ No newline at end of file + diff --git a/NSoup/NSoup.csproj.user b/NSoup/NSoup.csproj.user index fc3378a..b8840dc 100644 --- a/NSoup/NSoup.csproj.user +++ b/NSoup/NSoup.csproj.user @@ -2,5 +2,7 @@ + ShowAllFiles 
+ false \ No newline at end of file diff --git a/NSoup/NSoup.snk b/NSoup/NSoup.snk deleted file mode 100644 index 1f47068c83910385c0d60c02ad1db4c331b05c40..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 596 zcmV-a0;~N80ssI2Bme+XQ$aES1ONa50096sj9{B=aXW!-R_MPluB2NPmSj{-6ucGl z%3XTry3iu&<$I1*a8gMYX{|r%F_+H!EX2)OL_Qlo<$PUUwayB2fo1-`IXDZWAmF~{ z;rCmi%*BILbKZJw{5zr|J%pHS965HpPzJ3~*>?LQoV*7trm*GDrk6lygMJ7-JwwjV zBd3ZrB)QEO4%!6KyT-Q6x*-UL4K09|^V9p`ptQJIA$;WSBKwOM9y?8q zhKgRa+FB8X /// A single key + value attribute. Keys are trimmed and normalised to lower-case. /// diff --git a/NSoup/Nodes/Attributes.cs b/NSoup/Nodes/Attributes.cs index 24c567d..d3fa94e 100644 --- a/NSoup/Nodes/Attributes.cs +++ b/NSoup/Nodes/Attributes.cs @@ -1,8 +1,8 @@ using System; using System.Collections.Generic; +using System.Collections.ObjectModel; using System.Linq; using System.Text; -using System.Collections.ObjectModel; namespace NSoup.Nodes { @@ -231,7 +231,7 @@ public override bool Equals(object obj) return true; } - if (that.attributes != null && + if (that.attributes != null && attributes != null && attributes.Keys.SequenceEqual(that.attributes.Keys) && attributes.Values.SequenceEqual(that.attributes.Values)) diff --git a/NSoup/Nodes/Comment.cs b/NSoup/Nodes/Comment.cs index d137ace..0a27e6c 100644 --- a/NSoup/Nodes/Comment.cs +++ b/NSoup/Nodes/Comment.cs @@ -1,7 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; +using System.Text; namespace NSoup.Nodes { @@ -50,7 +47,7 @@ public override void OuterHtmlHead(StringBuilder accum, int depth, OutputSetting { Indent(accum, depth, output); } - + accum .Append(" - public class Document : Element - { - public enum QuirksModeEnum - { - NoQuirks, Quirks, LimitedQuirks - } - - public enum Syntax { html, xml } - - private OutputSettings _outputSettings = new OutputSettings(); - private QuirksModeEnum _quirksMode = QuirksModeEnum.NoQuirks; - private string 
_location = string.Empty; - private bool _updateMetaCharset = false; - - /// - /// Create a new, empty Document. - - /// - /// base URI of document - /// - /// - public Document(string baseUri) - : base(Tag.ValueOf("#root"), baseUri) - { - this._location = baseUri; - } - - protected Document() { } // Used for Node.Clone(). - - /// - /// Create a valid, empty shell of a document, suitable for adding more elements to. - /// - /// baseUri of document - /// document with html, head, and body elements. - static public Document CreateShell(string baseUri) - { - if (baseUri == null) - { - throw new ArgumentNullException("baseUri"); - } - - Document doc = new Document(baseUri); - Element html = doc.AppendElement("html"); - html.AppendElement("head"); - html.AppendElement("body"); - - return doc; - } - - public string Location { get { return this._location; } } - /// - /// Gets the document's head element. - /// - public Element Head - { - get { return FindFirstElementByTagName("head", this); } - } - - /// - /// Gets the document's body element. - /// - public Element Body - { - get { return FindFirstElementByTagName("body", this); } - } - - /// - /// Gets or sets the string contents of the document's {@code title} element. - /// On set, updates the existing element, or adds {@code title} to {@code head} if - /// not present. - /// - public string Title - { - get - { - // title is a preserve whitespace tag (for document output), but normalised here - var titleEl = GetElementsByTag("title").First; - return titleEl != null ? StringUtil.NormaliseWhitespace(titleEl.Text()).Trim() : string.Empty; - } - set - { - if (value == null) - { - throw new ArgumentNullException(); - } - - var titleEl = GetElementsByTag("title").First; - if (titleEl == null) - { // add to head - Head.AppendElement("title").Text(value); - } - else - { - titleEl.Text(value); - } - } - } - - /// - /// Gets the document's output settings. 
- /// - public OutputSettings OutputSettings() - { - return _outputSettings; - } - - /// - /// Sets the document's output settings. - /// - /// New output settings - /// This document, for chaining - public Document OutputSettings(OutputSettings outputSettings) - { - if (outputSettings == null) - { - throw new ArgumentNullException("outputSettings"); - } - - this._outputSettings = outputSettings; - return this; - } - - /// - /// Create a new Element, with this document's base uri. Does not make the new element a child of this document. - /// - /// element tag name (e.g. a) - /// new Element - public Element CreateElement(string tagName) - { - return new Element(Tag.ValueOf(tagName), this.BaseUri); - } - - /// - /// Normalise the document. This happens after the parse phase so generally does not need to be called. - /// Moves any text content that is not in the body element into the body. - /// - /// this document after normalisation - public Document Normalise() - { - var htmlEl = FindFirstElementByTagName("html", this); - if (htmlEl == null) - { - htmlEl = AppendElement("html"); - } - if (Head == null) - { - htmlEl.PrependElement("head"); - } - if (Body == null) - { - htmlEl.AppendElement("body"); - } - - // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care - // of. do in inverse order to maintain text order. - NormaliseTextNodes(Head); - NormaliseTextNodes(htmlEl); - NormaliseTextNodes(this); - - NormaliseStructure("head", htmlEl); - NormaliseStructure("body", htmlEl); - - EnsureMetaCharsetElement(); - - return this; - } - - public QuirksModeEnum QuirksMode() - { - return _quirksMode; - } - - public Document QuirksMode(QuirksModeEnum quirksMode) - { - this._quirksMode = quirksMode; - return this; - } - - // does not recurse. 
- private void NormaliseTextNodes(Element element) - { - var toMove = new List(); - foreach (var node in element.ChildNodes) - { - if (node is TextNode) - { - var tn = (TextNode)node; - if (!tn.IsBlank) - { - toMove.Add(tn); - } - } - } - - for (var i = toMove.Count - 1; i >= 0; i--) - { - var node = toMove[i]; - element.RemoveChild(node); - Body.PrependChild(new TextNode(" ", string.Empty)); - Body.PrependChild(node); - } - } - - // merge multiple or contents into one, delete the remainder, and ensure they are owned by - private void NormaliseStructure(string tag, Element htmlEl) - { - var elements = this.GetElementsByTag(tag); - - var master = elements.First; // will always be available as created above if not existent - if (elements.Count > 1) - { // dupes, move contents to master - var toMove = new List(); - - for (var i = 1; i < elements.Count; i++) - { - var dupe = elements[i]; - - foreach (var node in dupe.ChildNodes) - { - toMove.Add(node); - } - - dupe.Remove(); - } - - foreach (var dupe in toMove) - { - master.AppendChild(dupe); - } - } - - // ensure parented by - if (!master.Parent.Equals(htmlEl)) - { - htmlEl.AppendChild(master); // includes remove() - } - } - - // fast method to get first by tag name, used for html, head, body finders - private Element FindFirstElementByTagName(string tag, Node node) - { - if (node.NodeName.Equals(tag)) - { - return (Element)node; - } - else - { - foreach (var child in node.ChildNodes) - { - var found = FindFirstElementByTagName(tag, child); - if (found != null) - { - return found; - } - } - } - return null; - } - - public override string OuterHtml() - { - return base.Html(); - } - - /// - /// Set the text of the body of this document. Any existing nodes within the body will be cleared. - /// - /// unencoded text - /// this document - public override Element Text(string text) - { - Body.Text(text); // overridden to not nuke doc structure - return this; - } - - /// - /// Gets the node's name. 
- /// - public override string NodeName - { - get - { - return "#document"; - } - } - - public new object Clone() - { - Document clone = (Document)base.Clone(); - clone._outputSettings = (OutputSettings)this._outputSettings.Clone(); - return clone; - } - - public void charset(Encoding encoding) - { - SetUpdateMetaCharsetElement(true); - this._outputSettings.SetEncoding(encoding); - EnsureMetaCharsetElement(); - } - - public void SetUpdateMetaCharsetElement(bool update) - { - this._updateMetaCharset = update; - } - - public bool GetUpdateMetaCharsetElement() - { - return this._updateMetaCharset; - } - - public Encoding GetEncoding() - { - return _outputSettings.Encoding; - } - - private void EnsureMetaCharsetElement() - { - if (this._updateMetaCharset) - { - var syntax = this._outputSettings.GetSyntax; - - if (syntax == Syntax.html) - { - var metaCharset = Select("meta[charset]").First; - - if (metaCharset != null) - { - metaCharset.Attr("charset", GetEncoding().EncodingName); - } - else - { - if (Head != null) - { - Head.AppendElement("meta").Attr("charset", GetEncoding().EncodingName); - } - } - - // Remove obsolete elements - Select("meta[name=charset]").Remove(); - } - } - } - } - - /// - /// A Document's output settings control the form of the Text() and H tml() methods. 
- /// - public class OutputSettings : ICloneable - { - private Entities.EscapeMode _escapeMode = Entities.EscapeMode.Base; - private Encoding _encoding = Encoding.UTF8; - private Encoder _encoder = null; - private bool _prettyPrint = true; - private bool _outline = false; - private int _indentAmount = 1; - private Document.Syntax _syntax = Document.Syntax.html; - - public OutputSettings() - { - _encoder = _encoding.GetEncoder(); - } - - /// - /// Gets or sets the document's current HTML escape mode: base, which provides a limited set of named HTML - /// entities and escapes other characters as numbered entities for maximum compatibility; or extended, - /// which uses the complete set of HTML named entities. - ///

- /// The default escape mode is base. - ///

- public Entities.EscapeMode EscapeMode - { - get { return _escapeMode; } - } - - /// - /// Set the document's escape mode - /// - /// the new escape mode to use - /// the document's output settings, for chaining - public OutputSettings SetEscapeMode(Entities.EscapeMode escapeMode) - { - this._escapeMode = escapeMode; - return this; - } - - /// - /// Gets or sets the document's current output charset, which is used to control which characters are escaped when - /// generating HTML (via the html() methods), and which are kept intact. - ///

- /// Where possible (when parsing from a URL or File), the document's output charset is automatically set to the - /// input charset. Otherwise, it defaults to UTF-8. - ///

- public Encoding Encoding - { - get { return _encoding; } - } - - /// - /// Update the document's output charset. - /// - /// the new encoding to use. - /// the document's output settings, for chaining - public OutputSettings SetEncoding(Encoding encoding) - { - // todo: this should probably update the doc's meta charset - this._encoding = encoding; - this._encoder = _encoding.GetEncoder(); - return this; - } - - /// - /// Update the document's output charset. - /// - /// the new charset (by name) to use. - /// the document's output settings, for chaining - public OutputSettings SetEncoding(string encoding) - { - SetEncoding(Encoding.GetEncoding(encoding)); - return this; - } - - public Encoder Encoder - { - get { return _encoder; } - } - - public Document.Syntax GetSyntax - { - get - { - return _syntax; - } - } - - public OutputSettings SetSyntax(Document.Syntax syntax) - { - this._syntax = syntax; - return this; - } - - /// - /// Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format - /// the output, and the output will generally look like the input. - /// - /// if pretty printing is enabled. - public bool PrettyPrint() - { - return this._prettyPrint; - } - - /// - /// Enable or disable pretty printing. - /// - /// new pretty print setting - /// this, for chaining - public OutputSettings PrettyPrint(bool pretty) - { - this._prettyPrint = pretty; - return this; - } - - public bool GetOutline() - { - return this._outline; - } - - public OutputSettings outline(bool outlineMode) - { - this._outline = outlineMode; - return this; - } - - /// - /// Get the current tag indent amount, used when pretty printing. - /// - /// the current indent amount - public int IndentAmount() - { - return this._indentAmount; - } - - /// - /// Set the indent amount for pretty printing - /// - /// number of spaces to use for indenting each level. Must be >= 0. 
- /// this, for chaining - public OutputSettings IndentAmount(int indentAmount) - { - if (indentAmount < 0) - { - throw new ArgumentOutOfRangeException("indentAmount"); - } - this._indentAmount = indentAmount; - return this; - } - - #region ICloneable Members - - public object Clone() - { - OutputSettings clone = new OutputSettings(); - - clone.SetEncoding(_encoding.WebName); // new charset and charset encoder - clone.SetEscapeMode(_escapeMode); - clone.PrettyPrint(_prettyPrint); - clone.IndentAmount(_indentAmount); - - return clone; - } - - #endregion - } + /// + /// A HTML Document. + /// + /// + public class Document : Element + { + public enum QuirksModeEnum + { + NoQuirks, Quirks, LimitedQuirks + } + + public enum Syntax { html, xml } + + private OutputSettings _outputSettings = new OutputSettings(); + private QuirksModeEnum _quirksMode = QuirksModeEnum.NoQuirks; + private string _location = string.Empty; + private bool _updateMetaCharset = false; + + /// + /// Create a new, empty Document. + + /// + /// base URI of document + /// + /// + public Document(string baseUri) + : base(Tag.ValueOf("#root"), baseUri) + { + this._location = baseUri; + } + + protected Document() { } // Used for Node.Clone(). + + /// + /// Create a valid, empty shell of a document, suitable for adding more elements to. + /// + /// baseUri of document + /// document with html, head, and body elements. + static public Document CreateShell(string baseUri) + { + if (baseUri == null) + { + throw new ArgumentNullException("baseUri"); + } + + Document doc = new Document(baseUri); + Element html = doc.AppendElement("html"); + html.AppendElement("head"); + html.AppendElement("body"); + + return doc; + } + + public string Location { get { return this._location; } } + /// + /// Gets the document's head element. + /// + public Element Head + { + get { return FindFirstElementByTagName("head", this); } + } + + /// + /// Gets the document's body element. 
+ /// + public Element Body + { + get { return FindFirstElementByTagName("body", this); } + } + + /// + /// Gets or sets the string contents of the document's {@code title} element. + /// On set, updates the existing element, or adds {@code title} to {@code head} if + /// not present. + /// + public string Title + { + get + { + // title is a preserve whitespace tag (for document output), but normalised here + var titleEl = GetElementsByTag("title").First; + return titleEl != null ? StringUtil.NormaliseWhitespace(titleEl.Text()).Trim() : string.Empty; + } + set + { + if (value == null) + { + throw new ArgumentNullException(); + } + + var titleEl = GetElementsByTag("title").First; + if (titleEl == null) + { // add to head + Head.AppendElement("title").Text(value); + } + else + { + titleEl.Text(value); + } + } + } + + /// + /// Gets the document's output settings. + /// + public OutputSettings OutputSettings() + { + return _outputSettings; + } + + /// + /// Sets the document's output settings. + /// + /// New output settings + /// This document, for chaining + public Document OutputSettings(OutputSettings outputSettings) + { + if (outputSettings == null) + { + throw new ArgumentNullException("outputSettings"); + } + + this._outputSettings = outputSettings; + return this; + } + + /// + /// Create a new Element, with this document's base uri. Does not make the new element a child of this document. + /// + /// element tag name (e.g. a) + /// new Element + public Element CreateElement(string tagName) + { + return new Element(Tag.ValueOf(tagName), this.BaseUri); + } + + /// + /// Normalise the document. This happens after the parse phase so generally does not need to be called. + /// Moves any text content that is not in the body element into the body. 
+ /// + /// this document after normalisation + public Document Normalise() + { + var htmlEl = FindFirstElementByTagName("html", this); + if (htmlEl == null) + { + htmlEl = AppendElement("html"); + } + if (Head == null) + { + htmlEl.PrependElement("head"); + } + if (Body == null) + { + htmlEl.AppendElement("body"); + } + + // pull text nodes out of root, html, and head els, and push into body. non-text nodes are already taken care + // of. do in inverse order to maintain text order. + NormaliseTextNodes(Head); + NormaliseTextNodes(htmlEl); + NormaliseTextNodes(this); + + NormaliseStructure("head", htmlEl); + NormaliseStructure("body", htmlEl); + + EnsureMetaCharsetElement(); + + return this; + } + + public QuirksModeEnum QuirksMode() + { + return _quirksMode; + } + + public Document QuirksMode(QuirksModeEnum quirksMode) + { + this._quirksMode = quirksMode; + return this; + } + + // does not recurse. + private void NormaliseTextNodes(Element element) + { + var toMove = new List(); + foreach (var node in element.ChildNodes) + { + if (node is TextNode) + { + var tn = (TextNode)node; + if (!tn.IsBlank) + { + toMove.Add(tn); + } + } + } + + for (var i = toMove.Count - 1; i >= 0; i--) + { + var node = toMove[i]; + element.RemoveChild(node); + Body.PrependChild(new TextNode(" ", string.Empty)); + Body.PrependChild(node); + } + } + + // merge multiple or contents into one, delete the remainder, and ensure they are owned by + private void NormaliseStructure(string tag, Element htmlEl) + { + var elements = this.GetElementsByTag(tag); + + var master = elements.First; // will always be available as created above if not existent + if (elements.Count > 1) + { // dupes, move contents to master + var toMove = new List(); + + for (var i = 1; i < elements.Count; i++) + { + var dupe = elements[i]; + + foreach (var node in dupe.ChildNodes) + { + toMove.Add(node); + } + + dupe.Remove(); + } + + foreach (var dupe in toMove) + { + master.AppendChild(dupe); + } + } + + // ensure parented 
by + if (!master.Parent.Equals(htmlEl)) + { + htmlEl.AppendChild(master); // includes remove() + } + } + + // fast method to get first by tag name, used for html, head, body finders + private Element FindFirstElementByTagName(string tag, Node node) + { + if (node.NodeName.Equals(tag)) + { + return (Element)node; + } + else + { + foreach (var child in node.ChildNodes) + { + var found = FindFirstElementByTagName(tag, child); + if (found != null) + { + return found; + } + } + } + return null; + } + + public override string OuterHtml() + { + return base.Html(); + } + + /// + /// Set the text of the body of this document. Any existing nodes within the body will be cleared. + /// + /// unencoded text + /// this document + public override Element Text(string text) + { + Body.Text(text); // overridden to not nuke doc structure + return this; + } + + /// + /// Gets the node's name. + /// + public override string NodeName + { + get + { + return "#document"; + } + } + + public new object Clone() + { + Document clone = (Document)base.Clone(); + clone._outputSettings = (OutputSettings)this._outputSettings.Clone(); + return clone; + } + + public void charset(Encoding encoding) + { + SetUpdateMetaCharsetElement(true); + this._outputSettings.SetEncoding(encoding); + EnsureMetaCharsetElement(); + } + + public void SetUpdateMetaCharsetElement(bool update) + { + this._updateMetaCharset = update; + } + + public bool GetUpdateMetaCharsetElement() + { + return this._updateMetaCharset; + } + + public Encoding GetEncoding() + { + return _outputSettings.Encoding; + } + + private void EnsureMetaCharsetElement() + { + if (this._updateMetaCharset) + { + var syntax = this._outputSettings.GetSyntax; + + if (syntax == Syntax.html) + { + var metaCharset = Select("meta[charset]").First; + + if (metaCharset != null) + { + metaCharset.Attr("charset", GetEncoding().EncodingName); + } + else + { + if (Head != null) + { + Head.AppendElement("meta").Attr("charset", GetEncoding().EncodingName); + } + } + 
+ // Remove obsolete elements + Select("meta[name=charset]").Remove(); + } + } + } + } + + /// + /// A Document's output settings control the form of the Text() and Html() methods. + /// + public class OutputSettings : ICloneable + { + private Entities.EscapeMode _escapeMode = Entities.EscapeMode.Base; + private Encoding _encoding = Encoding.UTF8; + private Encoder _encoder = null; + private bool _prettyPrint = true; + private bool _outline = false; + private int _indentAmount = 1; + private Document.Syntax _syntax = Document.Syntax.html; + + public OutputSettings() + { + _encoder = _encoding.GetEncoder(); + } + + /// + /// Gets or sets the document's current HTML escape mode: base, which provides a limited set of named HTML + /// entities and escapes other characters as numbered entities for maximum compatibility; or extended, + /// which uses the complete set of HTML named entities. + ///

+ /// The default escape mode is base. + ///

+ public Entities.EscapeMode EscapeMode + { + get { return _escapeMode; } + } + + /// + /// Set the document's escape mode + /// + /// the new escape mode to use + /// the document's output settings, for chaining + public OutputSettings SetEscapeMode(Entities.EscapeMode escapeMode) + { + this._escapeMode = escapeMode; + return this; + } + + /// + /// Gets or sets the document's current output charset, which is used to control which characters are escaped when + /// generating HTML (via the html() methods), and which are kept intact. + ///

+ /// Where possible (when parsing from a URL or File), the document's output charset is automatically set to the + /// input charset. Otherwise, it defaults to UTF-8. + ///

+ public Encoding Encoding + { + get { return _encoding; } + } + + /// + /// Update the document's output charset. + /// + /// the new encoding to use. + /// the document's output settings, for chaining + public OutputSettings SetEncoding(Encoding encoding) + { + // todo: this should probably update the doc's meta charset + this._encoding = encoding; + this._encoder = _encoding.GetEncoder(); + return this; + } + + /// + /// Update the document's output charset. + /// + /// the new charset (by name) to use. + /// the document's output settings, for chaining + public OutputSettings SetEncoding(string encoding) + { + SetEncoding(Encoding.GetEncoding(encoding)); + return this; + } + + public Encoder Encoder + { + get { return _encoder; } + } + + public Document.Syntax GetSyntax + { + get + { + return _syntax; + } + } + + public OutputSettings SetSyntax(Document.Syntax syntax) + { + this._syntax = syntax; + return this; + } + + /// + /// Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format + /// the output, and the output will generally look like the input. + /// + /// if pretty printing is enabled. + public bool PrettyPrint() + { + return this._prettyPrint; + } + + /// + /// Enable or disable pretty printing. + /// + /// new pretty print setting + /// this, for chaining + public OutputSettings PrettyPrint(bool pretty) + { + this._prettyPrint = pretty; + return this; + } + + public bool GetOutline() + { + return this._outline; + } + + public OutputSettings outline(bool outlineMode) + { + this._outline = outlineMode; + return this; + } + + /// + /// Get the current tag indent amount, used when pretty printing. + /// + /// the current indent amount + public int IndentAmount() + { + return this._indentAmount; + } + + /// + /// Set the indent amount for pretty printing + /// + /// number of spaces to use for indenting each level. Must be >= 0. 
+ /// this, for chaining + public OutputSettings IndentAmount(int indentAmount) + { + if (indentAmount < 0) + { + throw new ArgumentOutOfRangeException("indentAmount"); + } + this._indentAmount = indentAmount; + return this; + } + + #region ICloneable Members + + public object Clone() + { + OutputSettings clone = new OutputSettings(); + + clone.SetEncoding(_encoding.WebName); // new charset and charset encoder + clone.SetEscapeMode(_escapeMode); + clone.PrettyPrint(_prettyPrint); + clone.IndentAmount(_indentAmount); + + return clone; + } + + #endregion + } } diff --git a/NSoup/Nodes/DocumentType.cs b/NSoup/Nodes/DocumentType.cs index b968cc3..ad49b76 100644 --- a/NSoup/Nodes/DocumentType.cs +++ b/NSoup/Nodes/DocumentType.cs @@ -1,8 +1,6 @@ -using System; -using System.Collections.Generic; -using System.Linq; +using NSoup.Helper; +using System; using System.Text; -using NSoup.Helper; namespace NSoup.Nodes { @@ -41,12 +39,12 @@ public override string NodeName public override void OuterHtmlHead(StringBuilder accum, int depth, OutputSettings output) { accum.Append(" /// Replace this node in the DOM with the supplied node. 
/// @@ -796,7 +795,8 @@ public object Clone() return DoClone(null); } - protected Node DoClone(Node parent) { + protected Node DoClone(Node parent) + { Node clone; try { diff --git a/NSoup/Nodes/TextNode.cs b/NSoup/Nodes/TextNode.cs index 5144400..b9a1e63 100644 --- a/NSoup/Nodes/TextNode.cs +++ b/NSoup/Nodes/TextNode.cs @@ -1,10 +1,6 @@ -using System; -using System.Collections.Generic; -using System.Linq; +using NSoup.Helper; +using System; using System.Text; -using System.Web; -using System.Text.RegularExpressions; -using NSoup.Helper; namespace NSoup.Nodes { @@ -106,7 +102,7 @@ public TextNode SplitText(int offset) string head = GetWholeText().Substring(0, offset); string tail = GetWholeText().Substring(offset); - + Text(head); TextNode tailNode = new TextNode(tail, this.BaseUri); diff --git a/NSoup/Nodes/XmlDeclaration.cs b/NSoup/Nodes/XmlDeclaration.cs index 09c70c1..0436043 100644 --- a/NSoup/Nodes/XmlDeclaration.cs +++ b/NSoup/Nodes/XmlDeclaration.cs @@ -1,7 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; +using System.Text; namespace NSoup.Nodes { diff --git a/NSoup/Parse/CharacterReader.cs b/NSoup/Parse/CharacterReader.cs index ba4585f..9790891 100644 --- a/NSoup/Parse/CharacterReader.cs +++ b/NSoup/Parse/CharacterReader.cs @@ -1,8 +1,5 @@ using System; -using System.Collections.Generic; using System.Globalization; -using System.Linq; -using System.Text; using System.Text.RegularExpressions; namespace NSoup.Parse @@ -112,15 +109,15 @@ public int NextIndexOf(string seq) // scan to first instance of startchar: if (startChar != _input[offset]) { - while (++offset < _length && startChar != _input[offset]); + while (++offset < _length && startChar != _input[offset]) ; } if (offset < _length) { int i = offset + 1; int last = i + seq.Length - 1; - + for (int j = 1; i < last && seq[j] == _input[i]; i++, j++) ; - + if (i == last) // found full sequence { return offset - _pos; @@ -243,7 +240,8 @@ public string 
ConsumeLetterThenDigitSequence() public string ConsumeHexSequence() { int start = _pos; - while (_pos < _length) { + while (_pos < _length) + { char c = _input[_pos]; if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) { diff --git a/NSoup/Parse/HtmlTreeBuilder.cs b/NSoup/Parse/HtmlTreeBuilder.cs index 4af91bf..1f7578b 100644 --- a/NSoup/Parse/HtmlTreeBuilder.cs +++ b/NSoup/Parse/HtmlTreeBuilder.cs @@ -1,9 +1,8 @@ -using System; +using NSoup.Helper; +using NSoup.Nodes; +using System; using System.Collections.Generic; using System.Linq; -using System.Text; -using NSoup.Nodes; -using NSoup.Helper; namespace NSoup.Parse { @@ -36,7 +35,7 @@ public override Document Parse(string input, string baseUri, ParseErrorList erro _state = HtmlTreeBuilderState.Initial; return base.Parse(input, baseUri, errors); } - + public IList ParseFragment(string inputFragment, Element context, string baseUri, ParseErrorList errors) { @@ -395,21 +394,23 @@ public void ClearStackToTableRowContext() ClearStackToContext("tr"); } - private void ClearStackToContext(params string[] nodeNames) { + private void ClearStackToContext(params string[] nodeNames) + { LinkedListNode node = _stack.Last; - while (node != null) { - Element next = node.Value; - if (StringUtil.In(next.NodeName, nodeNames) || next.NodeName.Equals("html")) + while (node != null) { - break; - } - else - { - _stack.Remove(node); - node = node.Previous; + Element next = node.Value; + if (StringUtil.In(next.NodeName, nodeNames) || next.NodeName.Equals("html")) + { + break; + } + else + { + _stack.Remove(node); + node = node.Previous; + } } } - } public Element AboveOnStack(Element el) { @@ -451,7 +452,7 @@ public void ResetInsertionMode() while (it.MoveNext()) { Element node = it.Current; - + if (_stack.FindLast(node).Previous == null) { last = true; @@ -713,7 +714,7 @@ private bool IsSameFormattingElement(Element a, Element b) { // same if: same namespace, tag, and attributes. 
Element.equals only checks tag, might in future check children return a.NodeName.Equals(b.NodeName) && - // a.namespace().equals(b.namespace()) && + // a.namespace().equals(b.namespace()) && a.Attributes.Equals(b.Attributes); // todo: namespaces } @@ -766,7 +767,7 @@ public void ReconstructFormattingElements() // 11 if (pos == size - 1) // if not last entry in list, jump to 7 - { + { break; } } diff --git a/NSoup/Parse/HtmlTreeBuilderState.cs b/NSoup/Parse/HtmlTreeBuilderState.cs index d850336..a665e4a 100644 --- a/NSoup/Parse/HtmlTreeBuilderState.cs +++ b/NSoup/Parse/HtmlTreeBuilderState.cs @@ -1,9 +1,8 @@ -using System; +using NSoup.Helper; +using NSoup.Nodes; +using System; using System.Collections.Generic; using System.Linq; -using System.Text; -using NSoup.Nodes; -using NSoup.Helper; namespace NSoup.Parse { diff --git a/NSoup/Parse/ParseError.cs b/NSoup/Parse/ParseError.cs index 6fa8905..a18856d 100644 --- a/NSoup/Parse/ParseError.cs +++ b/NSoup/Parse/ParseError.cs @@ -1,9 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace NSoup.Parse +namespace NSoup.Parse { /// /// A Parse Error records an error in the input HTML that occurs in either the tokenisation or the tree building phase. @@ -11,7 +6,7 @@ namespace NSoup.Parse // todo: currently not ready for public consumption. 
revisit api, and exposure methods public class ParseError { - + private int _pos; private string _errorMsg; diff --git a/NSoup/Parse/ParseErrorList.cs b/NSoup/Parse/ParseErrorList.cs index 48594f8..0192a5d 100644 --- a/NSoup/Parse/ParseErrorList.cs +++ b/NSoup/Parse/ParseErrorList.cs @@ -1,7 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; +using System.Collections.Generic; namespace NSoup.Parse { diff --git a/NSoup/Parse/Parser.cs b/NSoup/Parse/Parser.cs index 3612d4e..0e1a129 100644 --- a/NSoup/Parse/Parser.cs +++ b/NSoup/Parse/Parser.cs @@ -1,8 +1,7 @@ -using System; +using NSoup.Nodes; +using System; using System.Collections.Generic; using System.Linq; -using System.Text; -using NSoup.Nodes; namespace NSoup.Parse { @@ -16,7 +15,7 @@ namespace NSoup.Parse public class Parser { private static readonly int DEFAULT_MAX_ERRORS = 0; // by default, error tracking is disabled. - + private TreeBuilder _treeBuilder; private int _maxErrors = DEFAULT_MAX_ERRORS; private ParseErrorList _errors; @@ -25,11 +24,13 @@ public class Parser /// Create a new Parser, using the specified TreeBuilder /// /// TreeBuilder to use to parse input into Documents. - private Parser(TreeBuilder treeBuilder) { + private Parser(TreeBuilder treeBuilder) + { this._treeBuilder = treeBuilder; } - - public Document ParseInput(string html, string baseUri) { + + public Document ParseInput(string html, string baseUri) + { _errors = IsTrackErrors ? ParseErrorList.Tracking(_maxErrors) : ParseErrorList.NoTracking(); Document doc = _treeBuilder.Parse(html, baseUri, _errors); return doc; @@ -49,7 +50,8 @@ public TreeBuilder TreeBuilder() /// /// Current TreeBuilder /// this, for chaining - public Parser TreeBuilder(TreeBuilder treeBuilder) { + public Parser TreeBuilder(TreeBuilder treeBuilder) + { this._treeBuilder = treeBuilder; return this; } @@ -67,7 +69,8 @@ public bool IsTrackErrors /// /// The maximum number of errors to track. Set to 0 to disable. 
/// this, for chaining - public Parser SetTrackErrors(int maxErrors) { + public Parser SetTrackErrors(int maxErrors) + { this._maxErrors = maxErrors; return this; } @@ -76,7 +79,8 @@ public Parser SetTrackErrors(int maxErrors) { /// Retrieve the parse errors, if any, from the last parse. /// /// List of parse errors, up to the size of the maximum errors tracked. - public List GetErrors() { + public List GetErrors() + { return _errors; } @@ -134,7 +138,8 @@ public static Document ParseBodyFragment(string bodyHtml, string baseUri) /// HTML escaped string /// If the string is to be escaped in strict mode (as attributes are) /// An unescaped string - public static string UnescapeEntities(string s, bool inAttribute) { + public static string UnescapeEntities(string s, bool inAttribute) + { Tokeniser tokeniser = new Tokeniser(new CharacterReader(s), ParseErrorList.NoTracking()); return tokeniser.UnescapeEntities(inAttribute); } diff --git a/NSoup/Parse/Tag.cs b/NSoup/Parse/Tag.cs index a28e167..dd000ce 100644 --- a/NSoup/Parse/Tag.cs +++ b/NSoup/Parse/Tag.cs @@ -1,7 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; -using System.Text; namespace NSoup.Parse { diff --git a/NSoup/Parse/Token.cs b/NSoup/Parse/Token.cs index f6cfd22..5692e51 100644 --- a/NSoup/Parse/Token.cs +++ b/NSoup/Parse/Token.cs @@ -1,14 +1,12 @@ -using System; -using System.Collections.Generic; -using System.Linq; +using NSoup.Nodes; +using System; using System.Text; -using NSoup.Nodes; namespace NSoup.Parse { -/// -/// Parse tokens for the Tokeniser. -/// + /// + /// Parse tokens for the Tokeniser. 
+ /// public abstract class Token { TokenType _type; @@ -189,7 +187,7 @@ public override string ToString() { return string.Format("<{0} {1}>", Name(), Attributes.ToString()); } - + return string.Format("<{0}>", Name()); } } diff --git a/NSoup/Parse/TokenQueue.cs b/NSoup/Parse/TokenQueue.cs index c116b23..0d2a1c4 100644 --- a/NSoup/Parse/TokenQueue.cs +++ b/NSoup/Parse/TokenQueue.cs @@ -1,7 +1,5 @@ using NSoup.Helper; using System; -using System.Collections.Generic; -using System.Linq; using System.Text; namespace NSoup.Parse diff --git a/NSoup/Parse/Tokeniser.cs b/NSoup/Parse/Tokeniser.cs index 42eaff8..372dd3b 100644 --- a/NSoup/Parse/Tokeniser.cs +++ b/NSoup/Parse/Tokeniser.cs @@ -1,15 +1,13 @@ -using System; -using System.Collections.Generic; -using System.Linq; +using NSoup.Nodes; +using System; using System.Text; -using NSoup.Nodes; namespace NSoup.Parse { -/// -/// Readers the input stream into tokens. -/// + /// + /// Readers the input stream into tokens. + /// public class Tokeniser { public const char ReplacementChar = '\uFFFD'; // replaces null character diff --git a/NSoup/Parse/TokeniserState.cs b/NSoup/Parse/TokeniserState.cs index 4de544e..1916101 100644 --- a/NSoup/Parse/TokeniserState.cs +++ b/NSoup/Parse/TokeniserState.cs @@ -1,9 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace NSoup.Parse +namespace NSoup.Parse { /** * States and Transition activations for the Tokeniser. 
@@ -262,7 +257,7 @@ public override void Read(Tokeniser t, CharacterReader r) t.EofError(this); t.Transition(Data); break; - // no default, as covered with above ConsumeToAny + // no default, as covered with above ConsumeToAny } } }; @@ -722,7 +717,7 @@ public override void Read(Tokeniser t, CharacterReader r) string name = r.ConsumeLetterSequence(); t.TagPending.AppendTagName(name.ToLowerInvariant()); t.DataBuffer.Append(name); - + return; } @@ -1043,7 +1038,7 @@ public override void Read(Tokeniser t, CharacterReader r) t.Error(this); t.TagPending.AppendAttributeName(c); break; - // no default, as covered in ConsumeToAny + // no default, as covered in ConsumeToAny } } }; @@ -1183,7 +1178,7 @@ public override void Read(Tokeniser t, CharacterReader r) t.EofError(this); t.Transition(Data); break; - // no default, handled in Consume to any above + // no default, handled in Consume to any above } } }; @@ -1222,7 +1217,7 @@ public override void Read(Tokeniser t, CharacterReader r) t.EofError(this); t.Transition(Data); break; - // no default, handled in Consume to any above + // no default, handled in Consume to any above } } }; @@ -1277,7 +1272,7 @@ public override void Read(Tokeniser t, CharacterReader r) t.Error(this); t.TagPending.AppendAttributeValue(c); break; - // no default, handled in Consume to any above + // no default, handled in Consume to any above } } @@ -2119,7 +2114,7 @@ public override void Read(Tokeniser t, CharacterReader r) t.Error(this); t.Transition(BogusDoctype); break; - // NOT force quirks + // NOT force quirks } } }; diff --git a/NSoup/Parse/TreeBuilder.cs b/NSoup/Parse/TreeBuilder.cs index edc9dac..bcb7606 100644 --- a/NSoup/Parse/TreeBuilder.cs +++ b/NSoup/Parse/TreeBuilder.cs @@ -1,9 +1,6 @@ using NSoup.Helper; using NSoup.Nodes; using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; namespace NSoup.Parse { diff --git a/NSoup/Parse/TreeBuilderState.cs b/NSoup/Parse/TreeBuilderState.cs deleted file mode 
100644 index 546af51..0000000 --- a/NSoup/Parse/TreeBuilderState.cs +++ /dev/null @@ -1,2223 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using NSoup.Nodes; -using NSoup.Helper; - -namespace NSoup.Parse -{ - /// - /// The Tree Builder's current state. Each state embodies the processing for the state, and transitions to other states. - /// - internal abstract class TreeBuilderState - { - #region Subclasses - - protected class InitialState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (IsWhitespace(t)) - { - return true; // ignore whitespace - } - else if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (t.IsDoctype()) - { - // todo: parse error check on expected doctypes - // todo: quirk state check on doctype ids - Token.Doctype d = t.AsDoctype(); - DocumentType doctype = new DocumentType(d.Name.ToString(), d.PublicIdentifier.ToString(), d.SystemIdentifier.ToString(), tb.BaseUri.ToString()); - tb.Document.AppendChild(doctype); - - if (d.ForceQuirks) - { - tb.Document.QuirksMode(Document.QuirksModeEnum.Quirks); - } - tb.Transition(BeforeHtml); - } - else - { - // todo: check not iframe srcdoc - tb.Transition(BeforeHtml); - return tb.Process(t); // re-process token - } - return true; - } - }; - protected class BeforeHtmlState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsDoctype()) - { - tb.Error(this); - return false; - } - else if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (IsWhitespace(t)) - { - return true; // ignore whitespace - } - else if (t.IsStartTag() && t.AsStartTag().Name().Equals("html")) - { - tb.Insert(t.AsStartTag()); - tb.Transition(BeforeHead); - } - else if (t.IsEndTag() && (StringUtil.In(t.AsEndTag().Name(), "head", "body", "html", "br"))) - { - return AnythingElse(t, tb); - } - else if (t.IsEndTag()) - { - tb.Error(this); - return false; - } - else - { - return 
AnythingElse(t, tb); - } - return true; - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - tb.Insert("html"); - tb.Transition(BeforeHead); - return tb.Process(t); - } - }; - protected class BeforeHeadState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (IsWhitespace(t)) - { - return true; - } - else if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (t.IsDoctype()) - { - tb.Error(this); - return false; - } - else if (t.IsStartTag() && t.AsStartTag().Name().Equals("html")) - { - return InBody.Process(t, tb); // does not transition - } - else if (t.IsStartTag() && t.AsStartTag().Name().Equals("head")) - { - Element head = tb.Insert(t.AsStartTag()); - tb.HeadElement = head; - tb.Transition(InHead); - } - else if (t.IsEndTag() && (StringUtil.In(t.AsEndTag().Name(), "head", "body", "html", "br"))) - { - tb.Process(new Token.StartTag("head")); - return tb.Process(t); - } - else if (t.IsEndTag()) - { - tb.Error(this); - return false; - } - else - { - tb.Process(new Token.StartTag("head")); - return tb.Process(t); - } - return true; - } - }; - protected class InHeadState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (IsWhitespace(t)) - { - tb.Insert(t.AsCharacter()); - return true; - } - - switch (t.Type) - { - case Token.TokenType.Comment: - tb.Insert(t.AsComment()); - break; - case Token.TokenType.Doctype: - tb.Error(this); - return false; - case Token.TokenType.StartTag: - Token.StartTag start = t.AsStartTag(); - string name = start.Name(); - if (name.Equals("html")) - { - return InBody.Process(t, tb); - } - else if (StringUtil.In(name, "base", "basefont", "bgsound", "command", "link")) - { - Element el = tb.InsertEmpty(start); - // jsoup special: update base as it is seen. 
todo: flip to current browser behaviour of one shot - if (name.Equals("base") && el.HasAttr("href")) - tb.SetBaseUri(el); - } - else if (name.Equals("meta")) - { - Element meta = tb.InsertEmpty(start); - // todo: charset switches - } - else if (name.Equals("title")) - { - HandleRcData(start, tb); - } - else if (StringUtil.In(name, "noframes", "style")) - { - HandleRawText(start, tb); - } - else if (name.Equals("noscript")) - { - // else if noscript && scripting flag = true: rawtext (jsoup doesn't run script, to handle as noscript) - tb.Insert(start); - tb.Transition(InHeadNoscript); - } - else if (name.Equals("script")) - { - // skips some script rules as won't execute them - tb.Insert(start); - tb.Tokeniser.Transition(TokeniserState.ScriptData); - tb.MarkInsertionMode(); - tb.Transition(Text); - } - else if (name.Equals("head")) - { - tb.Error(this); - return false; - } - else - { - return AnythingElse(t, tb); - } - break; - case Token.TokenType.EndTag: - Token.EndTag end = t.AsEndTag(); - name = end.Name(); - if (name.Equals("head")) - { - tb.Pop(); - tb.Transition(AfterHead); - } - else if (StringUtil.In(name, "body", "html", "br")) - { - return AnythingElse(t, tb); - } - else - { - tb.Error(this); - return false; - } - break; - default: - return AnythingElse(t, tb); - } - return true; - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - tb.Process(new Token.EndTag("head")); - return tb.Process(t); - } - }; - protected class InHeadNoscriptState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsDoctype()) - { - tb.Error(this); - } - else if (t.IsStartTag() && t.AsStartTag().Name().Equals("html")) - { - return tb.Process(t, InBody); - } - else if (t.IsEndTag() && t.AsEndTag().Name().Equals("noscript")) - { - tb.Pop(); - tb.Transition(InHead); - } - else if (IsWhitespace(t) || t.IsComment() || (t.IsStartTag() && StringUtil.In(t.AsStartTag().Name(), - "basefont", "bgsound", "link", "meta", "noframes", "style"))) 
- { - return tb.Process(t, InHead); - } - else if (t.IsEndTag() && t.AsEndTag().Name().Equals("br")) - { - return AnythingElse(t, tb); - } - else if ((t.IsStartTag() && StringUtil.In(t.AsStartTag().Name(), "head", "noscript")) || t.IsEndTag()) - { - tb.Error(this); - return false; - } - else - { - return AnythingElse(t, tb); - } - return true; - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - tb.Error(this); - tb.Process(new Token.EndTag("noscript")); - return tb.Process(t); - } - }; - protected class AfterHeadState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (IsWhitespace(t)) - { - tb.Insert(t.AsCharacter()); - } - else if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (t.IsDoctype()) - { - tb.Error(this); - } - else if (t.IsStartTag()) - { - Token.StartTag startTag = t.AsStartTag(); - string name = startTag.Name(); - if (name.Equals("html")) - { - return tb.Process(t, InBody); - } - else if (name.Equals("body")) - { - tb.Insert(startTag); - tb.FramesetOk(false); - tb.Transition(InBody); - } - else if (name.Equals("frameset")) - { - tb.Insert(startTag); - tb.Transition(InFrameset); - } - else if (StringUtil.In(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) - { - tb.Error(this); - Element head = tb.HeadElement; - tb.Push(head); - tb.Process(t, InHead); - tb.RemoveFromStack(head); - } - else if (name.Equals("head")) - { - tb.Error(this); - return false; - } - else - { - AnythingElse(t, tb); - } - } - else if (t.IsEndTag()) - { - if (StringUtil.In(t.AsEndTag().Name(), "body", "html")) - { - AnythingElse(t, tb); - } - else - { - tb.Error(this); - return false; - } - } - else - { - AnythingElse(t, tb); - } - return true; - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - tb.Process(new Token.StartTag("body")); - tb.FramesetOk(true); - return tb.Process(t); - } - }; - protected class InBodyState : TreeBuilderState - { - public override 
bool Process(Token t, TreeBuilder tb) - { - switch (t.Type) - { - case Token.TokenType.Character: - Token.Character c = t.AsCharacter(); - if (c.Data.Equals(_nullString)) - { - // todo confirm that check - tb.Error(this); - return false; - } - else if (IsWhitespace(c)) - { - tb.ReconstructFormattingElements(); - tb.Insert(c); - } - else - { - tb.ReconstructFormattingElements(); - tb.Insert(c); - tb.FramesetOk(false); - } - break; - case Token.TokenType.Comment: - tb.Insert(t.AsComment()); - break; - case Token.TokenType.Doctype: - tb.Error(this); - return false; - case Token.TokenType.StartTag: - Token.StartTag startTag = t.AsStartTag(); - string name = startTag.Name(); - if (name.Equals("html")) - { - tb.Error(this); - // merge attributes onto real html - Element html = tb.Stack.First.Value; - foreach (NSoup.Nodes.Attribute attribute in startTag.Attributes) - { - if (!html.HasAttr(attribute.Key)) - { - html.Attributes.Add(attribute); - } - } - } - else if (StringUtil.In(name, "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title")) - { - return tb.Process(t, InHead); - } - else if (name.Equals("body")) - { - tb.Error(this); - LinkedList stack = tb.Stack; - if (stack.Count == 1 || (stack.Count > 2 && !stack.ElementAt(1).NodeName.Equals("body"))) - { - // only in fragment case - return false; // ignore - } - else - { - tb.FramesetOk(false); - Element body = stack.ElementAt(1); - foreach (NSoup.Nodes.Attribute attribute in startTag.Attributes) - { - if (!body.HasAttr(attribute.Key)) - { - body.Attributes.Add(attribute); - } - } - } - } - else if (name.Equals("frameset")) - { - tb.Error(this); - LinkedList stack = tb.Stack; - if (stack.Count == 1 || (stack.Count > 2 && !stack.ElementAt(1).NodeName.Equals("body"))) - { - // only in fragment case - return false; // ignore - } - else if (!tb.FramesetOk()) - { - return false; // ignore frameset - } - else - { - Element second = stack.ElementAt(1); - if (second.Parent != null) - 
second.Remove(); - // pop up to html element - - while (stack.Count > 1) - { - stack.RemoveLast(); - } - - tb.Insert(startTag); - tb.Transition(InFrameset); - } - } - else if (StringUtil.In(name, - "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", - "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", - "p", "section", "summary", "ul")) - { - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - tb.Insert(startTag); - } - else if (StringUtil.In(name, "h1", "h2", "h3", "h4", "h5", "h6")) - { - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - if (StringUtil.In(tb.CurrentElement.NodeName, "h1", "h2", "h3", "h4", "h5", "h6")) - { - tb.Error(this); - tb.Pop(); - } - tb.Insert(startTag); - } - else if (StringUtil.In(name, "pre", "listing")) - { - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - tb.Insert(startTag); - // todo: ignore LF if next token - tb.FramesetOk(false); - } - else if (name.Equals("form")) - { - if (tb.FormElement != null) - { - tb.Error(this); - return false; - } - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - Element form = tb.Insert(startTag); - tb.FormElement = form; - } - else if (name.Equals("li")) - { - tb.FramesetOk(false); - LinkedList stack = tb.Stack; - for (int i = stack.Count - 1; i > 0; i--) - { - Element el = stack.ElementAt(i); - if (el.NodeName.Equals("li")) - { - tb.Process(new Token.EndTag("li")); - break; - } - if (tb.IsSpecial(el) && !StringUtil.In(el.NodeName, "address", "div", "p")) - break; - } - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - tb.Insert(startTag); - } - else if (StringUtil.In(name, "dd", "dt")) - { - tb.FramesetOk(false); - LinkedList stack = tb.Stack; - for (int i = stack.Count - 1; i > 0; i--) - { - Element el = stack.ElementAt(i); - if (StringUtil.In(el.NodeName, "dd", "dt")) - { - tb.Process(new 
Token.EndTag(el.NodeName)); - break; - } - - if (tb.IsSpecial(el) && !StringUtil.In(el.NodeName, "address", "div", "p")) - { - break; - } - } - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - tb.Insert(startTag); - } - else if (name.Equals("plaintext")) - { - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - tb.Insert(startTag); - tb.Tokeniser.Transition(TokeniserState.PlainText); // once in, never gets out - } - else if (name.Equals("button")) - { - if (tb.InButtonScope("button")) - { - // close and reprocess - tb.Error(this); - tb.Process(new Token.EndTag("button")); - tb.Process(startTag); - } - else - { - tb.ReconstructFormattingElements(); - tb.Insert(startTag); - tb.FramesetOk(false); - } - } - else if (name.Equals("a")) - { - if (tb.GetActiveFormattingElement("a") != null) - { - tb.Error(this); - tb.Process(new Token.EndTag("a")); - - // still on stack? - Element remainingA = tb.GetFromStack("a"); - if (remainingA != null) - { - tb.RemoveFromActiveFormattingElements(remainingA); - tb.RemoveFromStack(remainingA); - } - } - tb.ReconstructFormattingElements(); - Element a = tb.Insert(startTag); - tb.PushActiveFormattingElements(a); - } - else if (StringUtil.In(name, - "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u")) - { - tb.ReconstructFormattingElements(); - Element el = tb.Insert(startTag); - tb.PushActiveFormattingElements(el); - } - else if (name.Equals("nobr")) - { - tb.ReconstructFormattingElements(); - if (tb.InScope("nobr")) - { - tb.Error(this); - tb.Process(new Token.EndTag("nobr")); - tb.ReconstructFormattingElements(); - } - Element el = tb.Insert(startTag); - tb.PushActiveFormattingElements(el); - } - else if (StringUtil.In(name, "applet", "marquee", "object")) - { - tb.ReconstructFormattingElements(); - tb.Insert(startTag); - tb.InsertMarkerToFormattingElements(); - tb.FramesetOk(false); - } - else if (name.Equals("table")) - { - if (tb.Document.QuirksMode() != 
Document.QuirksModeEnum.Quirks && tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - tb.Insert(startTag); - tb.FramesetOk(false); - tb.Transition(InTable); - } - else if (StringUtil.In(name, "area", "br", "embed", "img", "keygen", "wbr")) - { - tb.ReconstructFormattingElements(); - tb.InsertEmpty(startTag); - tb.FramesetOk(false); - } - else if (name.Equals("input")) - { - tb.ReconstructFormattingElements(); - Element el = tb.InsertEmpty(startTag); - - if (!el.Attr("type").Equals("hidden", StringComparison.InvariantCultureIgnoreCase)) - { - tb.FramesetOk(false); - } - } - else if (StringUtil.In(name, "param", "source", "track")) - { - tb.InsertEmpty(startTag); - } - else if (name.Equals("hr")) - { - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - tb.InsertEmpty(startTag); - tb.FramesetOk(false); - } - else if (name.Equals("image")) - { - // we're not supposed to ask. - startTag.Name("img"); - return tb.Process(startTag); - } - else if (name.Equals("isindex")) - { - // how much do we care about the early 90s? - tb.Error(this); - - if (tb.FormElement != null) - { - return false; - } - - tb.Tokeniser.AcknowledgeSelfClosingFlag(); - tb.Process(new Token.StartTag("form")); - if (startTag.Attributes.ContainsKey("action")) - { - Element form = tb.FormElement; - form.Attr("action", startTag.Attributes["action"]); - } - tb.Process(new Token.StartTag("hr")); - tb.Process(new Token.StartTag("label")); - // hope you like english. - string prompt = startTag.Attributes.ContainsKey("prompt") ? - startTag.Attributes["prompt"] : - "This is a searchable index. 
Enter search keywords: "; - - tb.Process(new Token.Character(prompt)); - - // input - Attributes inputAttribs = new Attributes(); - foreach (NSoup.Nodes.Attribute attr in startTag.Attributes) - { - if (!StringUtil.In(attr.Key, "name", "action", "prompt")) - { - inputAttribs.Add(attr); - } - } - inputAttribs["name"] = "isindex"; - tb.Process(new Token.StartTag("input", inputAttribs)); - tb.Process(new Token.EndTag("label")); - tb.Process(new Token.StartTag("hr")); - tb.Process(new Token.EndTag("form")); - } - else if (name.Equals("textarea")) - { - tb.Insert(startTag); - // todo: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.) - tb.Tokeniser.Transition(TokeniserState.RcData); - tb.MarkInsertionMode(); - tb.FramesetOk(false); - tb.Transition(Text); - } - else if (name.Equals("xmp")) - { - if (tb.InButtonScope("p")) - { - tb.Process(new Token.EndTag("p")); - } - tb.ReconstructFormattingElements(); - tb.FramesetOk(false); - HandleRawText(startTag, tb); - } - else if (name.Equals("iframe")) - { - tb.FramesetOk(false); - HandleRawText(startTag, tb); - } - else if (name.Equals("noembed")) - { - // also handle noscript if script enabled - HandleRawText(startTag, tb); - } - else if (name.Equals("select")) - { - tb.ReconstructFormattingElements(); - tb.Insert(startTag); - tb.FramesetOk(false); - - TreeBuilderState state = tb.State; - if (state.Equals(InTable) || state.Equals(InCaption) || state.Equals(InTableBody) || state.Equals(InRow) || state.Equals(InCell)) - { - tb.Transition(InSelectInTable); - } - else - { - tb.Transition(InSelect); - } - } - else if (StringUtil.In("optgroup", "option")) - { - if (tb.CurrentElement.NodeName.Equals("option")) - { - tb.Process(new Token.EndTag("option")); - } - tb.ReconstructFormattingElements(); - tb.Insert(startTag); - } - else if (StringUtil.In("rp", "rt")) - { - if 
(tb.InScope("ruby")) - { - tb.GenerateImpliedEndTags(); - if (!tb.CurrentElement.NodeName.Equals("ruby")) - { - tb.Error(this); - tb.PopStackToBefore("ruby"); // i.e. close up to but not include name - } - tb.Insert(startTag); - } - } - else if (name.Equals("math")) - { - tb.ReconstructFormattingElements(); - // todo: handle A start tag whose tag name is "math" (i.e. foreign, mathml) - tb.Insert(startTag); - tb.Tokeniser.AcknowledgeSelfClosingFlag(); - } - else if (name.Equals("svg")) - { - tb.ReconstructFormattingElements(); - // todo: handle A start tag whose tag name is "svg" (xlink, svg) - tb.Insert(startTag); - tb.Tokeniser.AcknowledgeSelfClosingFlag(); - } - else if (StringUtil.In(name, - "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr")) - { - tb.Error(this); - return false; - } - else - { - tb.ReconstructFormattingElements(); - tb.Insert(startTag); - } - break; - case Token.TokenType.EndTag: - Token.EndTag endTag = t.AsEndTag(); - name = endTag.Name(); - if (name.Equals("body")) - { - if (!tb.InScope("body")) - { - tb.Error(this); - return false; - } - else - { - // todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html - tb.Transition(AfterBody); - } - } - else if (name.Equals("html")) - { - bool notIgnored = tb.Process(new Token.EndTag("body")); - if (notIgnored) - { - return tb.Process(endTag); - } - } - else if (StringUtil.In(name, - "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", - "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", - "nav", "ol", "pre", "section", "summary", "ul")) - { - // todo: refactor these lookups - if (!tb.InScope(name)) - { - // nothing to close - tb.Error(this); - return false; - } - else - { - tb.GenerateImpliedEndTags(); - if (!tb.CurrentElement.NodeName.Equals(name)) - { - tb.Error(this); - } - tb.PopStackToClose(name); - } - } - 
else if (name.Equals("form")) - { - Element currentForm = tb.FormElement; - tb.FormElement = null; - if (currentForm == null || !tb.InScope(name)) - { - tb.Error(this); - return false; - } - else - { - tb.GenerateImpliedEndTags(); - if (!tb.CurrentElement.NodeName.Equals(name)) - { - tb.Error(this); - } - // remove currentForm from stack. will shift anything under up. - tb.RemoveFromStack(currentForm); - } - } - else if (name.Equals("p")) - { - if (!tb.InButtonScope(name)) - { - tb.Error(this); - tb.Process(new Token.StartTag(name)); // if no p to close, creates an empty

- return tb.Process(endTag); - } - else - { - tb.GenerateImpliedEndTags(name); - if (!tb.CurrentElement.NodeName.Equals(name)) - { - tb.Error(this); - } - tb.PopStackToClose(name); - } - } - else if (name.Equals("li")) - { - if (!tb.InListItemScope(name)) - { - tb.Error(this); - return false; - } - else - { - tb.GenerateImpliedEndTags(name); - if (!tb.CurrentElement.NodeName.Equals(name)) - { - tb.Error(this); - } - tb.PopStackToClose(name); - } - } - else if (StringUtil.In(name, "dd", "dt")) - { - if (!tb.InScope(name)) - { - tb.Error(this); - return false; - } - else - { - tb.GenerateImpliedEndTags(name); - if (!tb.CurrentElement.NodeName.Equals(name)) - { - tb.Error(this); - } - tb.PopStackToClose(name); - } - } - else if (StringUtil.In(name, "h1", "h2", "h3", "h4", "h5", "h6")) - { - if (!tb.InScope(new string[] { "h1", "h2", "h3", "h4", "h5", "h6" })) - { - tb.Error(this); - return false; - } - else - { - tb.GenerateImpliedEndTags(name); - if (!tb.CurrentElement.NodeName.Equals(name)) - { - tb.Error(this); - } - tb.PopStackToClose("h1", "h2", "h3", "h4", "h5", "h6"); - } - } - else if (name.Equals("sarcasm")) - { - // *sigh* - return AnyOtherEndTag(t, tb); - } - else if (StringUtil.In(name, - "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u")) - { - // Adoption Agency Algorithm. 
- //OUTER: - for (int i = 0; i < 8; i++) - { - Element formatEl = tb.GetActiveFormattingElement(name); - if (formatEl == null) - { - return AnyOtherEndTag(t, tb); - } - else if (!tb.OnStack(formatEl)) - { - tb.Error(this); - tb.RemoveFromActiveFormattingElements(formatEl); - return true; - } - else if (!tb.InScope(formatEl.NodeName)) - { - tb.Error(this); - return false; - } - else if (tb.CurrentElement != formatEl) - { - tb.Error(this); - } - - Element furthestBlock = null; - Element commonAncestor = null; - bool seenFormattingElement = false; - LinkedList stack = tb.Stack; - for (int si = 0; si < stack.Count; si++) - { - Element el = stack.ElementAt(si); - if (el == formatEl) - { - commonAncestor = stack.ElementAt(si - 1); - seenFormattingElement = true; - } - else if (seenFormattingElement && tb.IsSpecial(el)) - { - furthestBlock = el; - break; - } - } - - if (furthestBlock == null) - { - tb.PopStackToClose(formatEl.NodeName); - tb.RemoveFromActiveFormattingElements(formatEl); - return true; - } - - // todo: Let a bookmark note the position of the formatting element in the list of active formatting elements relative to the elements on either side of it in the list. - // does that mean: int pos of format el in list? - Element node = furthestBlock; - Element lastNode = furthestBlock; - - for (int j = 0; j < 3; j++) - { - if (tb.OnStack(node)) - node = tb.AboveOnStack(node); - if (!tb.IsInActiveFormattingElements(node)) - { // note no bookmark check - tb.RemoveFromStack(node); - continue; - } - else if (node == formatEl) - { - break; - } - - Element replacement = new Element(Tag.ValueOf(node.NodeName), tb.BaseUri); - tb.ReplaceActiveFormattingElement(node, replacement); - tb.ReplaceOnStack(node, replacement); - node = replacement; - - if (lastNode == furthestBlock) - { - // todo: move the aforementioned bookmark to be immediately after the new node in the list of active formatting elements. 
- // not getting how this bookmark both straddles the element above, but is inbetween here... - } - if (lastNode.Parent != null) - { - lastNode.Remove(); - } - - node.AppendChild(lastNode); - - lastNode = node; - } - - if (StringUtil.In(commonAncestor.NodeName, "table", "tbody", "tfoot", "thead", "tr")) - { - if (lastNode.Parent != null) - { - lastNode.Remove(); - } - - tb.InsertInFosterParent(lastNode); - } - else - { - if (lastNode.Parent != null) - { - lastNode.Remove(); - } - - commonAncestor.AppendChild(lastNode); - } - - Element adopter = new Element(Tag.ValueOf(name), tb.BaseUri); - Node[] childNodes = furthestBlock.ChildNodes.ToArray(); - foreach (Node childNode in childNodes) - { - adopter.AppendChild(childNode); // append will reparent. thus the clone to avvoid concurrent mod. - } - - furthestBlock.AppendChild(adopter); - tb.RemoveFromActiveFormattingElements(formatEl); - // todo: insert the new element into the list of active formatting elements at the position of the aforementioned bookmark. 
- tb.RemoveFromStack(formatEl); - tb.InsertOnStackAfter(furthestBlock, adopter); - } - } - else if (StringUtil.In(name, "applet", "marquee", "object")) - { - if (!tb.InScope("name")) - { - if (!tb.InScope(name)) - { - tb.Error(this); - return false; - } - - tb.GenerateImpliedEndTags(); - - if (!tb.CurrentElement.NodeName.Equals(name)) - { - tb.Error(this); - } - - tb.PopStackToClose(name); - tb.ClearFormattingElementsToLastMarker(); - } - } - else if (name.Equals("br")) - { - tb.Error(this); - tb.Process(new Token.StartTag("br")); - return false; - } - else - { - return AnyOtherEndTag(t, tb); - } - break; - case Token.TokenType.EOF: - // todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html - // stop parsing - break; - default: - break; - } - return true; - } - - bool AnyOtherEndTag(Token t, TreeBuilder tb) - { - string name = t.AsEndTag().Name(); - DescendableLinkedList stack = tb.Stack; - IEnumerator it = stack.GetDescendingEnumerator(); - - while (it.MoveNext()) - { - Element node = it.Current; - if (node.NodeName.Equals(name)) - { - tb.GenerateImpliedEndTags(name); - - if (!name.Equals(tb.CurrentElement.NodeName)) - { - tb.Error(this); - } - - tb.PopStackToClose(name); - break; - } - else - { - if (tb.IsSpecial(node)) - { - tb.Error(this); - return false; - } - } - } - - return true; - } - }; - protected class TextState : TreeBuilderState - { - // in script, style etc. 
normally treated as data tags - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsCharacter()) - { - tb.Insert(t.AsCharacter()); - } - else if (t.IsEOF()) - { - tb.Error(this); - // if current node is script: already started - tb.Pop(); - tb.Transition(tb.OriginalState); - return tb.Process(t); - } - else if (t.IsEndTag()) - { - // if: An end tag whose tag name is "script" -- scripting nesting level, if evaluating scripts - tb.Pop(); - tb.Transition(tb.OriginalState); - } - return true; - } - }; - protected class InTableState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsCharacter()) - { - tb.NewPendingTableCharacters(); - tb.MarkInsertionMode(); - tb.Transition(InTableText); - return tb.Process(t); - } - else if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (t.IsDoctype()) - { - tb.Error(this); - return false; - } - else if (t.IsStartTag()) - { - Token.StartTag startTag = t.AsStartTag(); - string name = startTag.Name(); - if (name.Equals("caption")) - { - tb.ClearStackToTableContext(); - tb.InsertMarkerToFormattingElements(); - tb.Insert(startTag); - tb.Transition(InCaption); - } - else if (name.Equals("colgroup")) - { - tb.ClearStackToTableContext(); - tb.Insert(startTag); - tb.Transition(InColumnGroup); - } - else if (name.Equals("col")) - { - tb.Process(new Token.StartTag("colgroup")); - return tb.Process(t); - } - else if (StringUtil.In(name, "tbody", "tfoot", "thead")) - { - tb.ClearStackToTableContext(); - tb.Insert(startTag); - tb.Transition(InTableBody); - } - else if (StringUtil.In(name, "td", "th", "tr")) - { - tb.Process(new Token.StartTag("tbody")); - return tb.Process(t); - } - else if (name.Equals("table")) - { - tb.Error(this); - bool processed = tb.Process(new Token.EndTag("table")); - if (processed) // only ignored if in fragment - { - return tb.Process(t); - } - } - else if (StringUtil.In(name, "style", "script")) - { - return tb.Process(t, InHead); - } - else if 
(name.Equals("input")) - { - if (!startTag.Attributes["type"].Equals("hidden", StringComparison.InvariantCultureIgnoreCase)) - { - return AnythingElse(t, tb); - } - else - { - tb.InsertEmpty(startTag); - } - } - else if (name.Equals("form")) - { - tb.Error(this); - if (tb.FormElement != null) - return false; - else - { - Element form = tb.InsertEmpty(startTag); - tb.FormElement = form; - } - } - else - { - return AnythingElse(t, tb); - } - } - else if (t.IsEndTag()) - { - Token.EndTag endTag = t.AsEndTag(); - string name = endTag.Name(); - - if (name.Equals("table")) - { - if (!tb.InTableScope(name)) - { - tb.Error(this); - return false; - } - else - { - tb.PopStackToClose("table"); - } - tb.ResetInsertionMode(); - } - else if (StringUtil.In(name, - "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) - { - tb.Error(this); - return false; - } - else - { - return AnythingElse(t, tb); - } - } - else if (t.IsEOF()) - { - if (tb.CurrentElement.NodeName.Equals("html")) - tb.Error(this); - return true; // stops parsing - } - return AnythingElse(t, tb); - } - - bool AnythingElse(Token t, TreeBuilder tb) - { - tb.Error(this); - bool processed = true; - if (StringUtil.In(tb.CurrentElement.NodeName, "table", "tbody", "tfoot", "thead", "tr")) - { - tb.IsFosterInserts = true; - processed = tb.Process(t, InBody); - tb.IsFosterInserts = false; - } - else - { - processed = tb.Process(t, InBody); - } - return processed; - } - }; - protected class InTableTextState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - switch (t.Type) - { - case Token.TokenType.Character: - Token.Character c = t.AsCharacter(); - if (c.Data.ToString().Equals(_nullString)) - { - tb.Error(this); - return false; - } - else - { - tb.PendingTableCharacters.Add(c); - } - break; - default: - if (tb.PendingTableCharacters.Count > 0) - { - foreach (Token.Character character in tb.PendingTableCharacters) - { - if (!IsWhitespace(character)) - 
{ - // InTable anything else section: - tb.Error(this); - if (StringUtil.In(tb.CurrentElement.NodeName, "table", "tbody", "tfoot", "thead", "tr")) - { - tb.IsFosterInserts = true; - tb.Process(character, InBody); - tb.IsFosterInserts = false; - } - else - { - tb.Process(character, InBody); - } - } - else - { - tb.Insert(character); - } - } - tb.NewPendingTableCharacters(); - } - tb.Transition(tb.OriginalState); - return tb.Process(t); - } - return true; - } - }; - protected class InCaptionState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsEndTag() && t.AsEndTag().Name().Equals("caption")) - { - Token.EndTag endTag = t.AsEndTag(); - string name = endTag.Name(); - - if (!tb.InTableScope(name)) - { - tb.Error(this); - return false; - } - else - { - tb.GenerateImpliedEndTags(); - if (!tb.CurrentElement.NodeName.Equals("caption")) - { - tb.Error(this); - } - tb.PopStackToClose("caption"); - tb.ClearFormattingElementsToLastMarker(); - tb.Transition(InTable); - } - } - else if ((t.IsStartTag() && - StringUtil.In(t.AsStartTag().Name(), "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr") || - t.IsEndTag() && - t.AsEndTag().Name().Equals("table"))) - { - tb.Error(this); - bool processed = tb.Process(new Token.EndTag("caption")); - - if (processed) - { - return tb.Process(t); - } - } - else if (t.IsEndTag() && StringUtil.In(t.AsEndTag().Name(), - "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) - { - tb.Error(this); - return false; - } - else - { - return tb.Process(t, InBody); - } - return true; - } - }; - protected class InColumnGroupState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (IsWhitespace(t)) - { - tb.Insert(t.AsCharacter()); - return true; - } - - switch (t.Type) - { - case Token.TokenType.Comment: - - tb.Insert(t.AsComment()); - break; - case Token.TokenType.Doctype: - tb.Error(this); - break; - case 
Token.TokenType.StartTag: - - Token.StartTag startTag = t.AsStartTag(); - String name = startTag.Name(); - if (name.Equals("html")) - { - return tb.Process(t, InBody); - } - else if (name.Equals("col")) - { - tb.InsertEmpty(startTag); - } - else - { - return AnythingElse(t, tb); - } - break; - case Token.TokenType.EndTag: - Token.EndTag endTag = t.AsEndTag(); - name = endTag.Name(); - if (name.Equals("colgroup")) - { - if (tb.CurrentElement.NodeName.Equals("html")) - { // frag case - tb.Error(this); - return false; - } - else - { - tb.Pop(); - tb.Transition(InTable); - } - } - else - { - return AnythingElse(t, tb); - } - break; - case Token.TokenType.EOF: - if (tb.CurrentElement.NodeName.Equals("html")) - { - return true; // stop parsing; frag case - } - else - { - return AnythingElse(t, tb); - } - default: - return AnythingElse(t, tb); - } - return true; - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - bool processed = tb.Process(new Token.EndTag("colgroup")); - if (processed) // only ignored in frag case - { - return tb.Process(t); - } - return true; - } - }; - protected class InTableBodyState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - switch (t.Type) - { - case Token.TokenType.StartTag: - Token.StartTag startTag = t.AsStartTag(); - string name = startTag.Name(); - if (name.Equals("tr")) - { - tb.ClearStackToTableBodyContext(); - tb.Insert(startTag); - tb.Transition(InRow); - } - else if (StringUtil.In(name, "th", "td")) - { - tb.Error(this); - tb.Process(new Token.StartTag("tr")); - return tb.Process(startTag); - } - else if (StringUtil.In(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead")) - { - return ExitTableBody(t, tb); - } - else - { - return AnythingElse(t, tb); - } - break; - case Token.TokenType.EndTag: - Token.EndTag endTag = t.AsEndTag(); - name = endTag.Name(); - if (StringUtil.In(name, "tbody", "tfoot", "thead")) - { - if (!tb.InTableScope(name)) - { - tb.Error(this); - return 
false; - } - else - { - tb.ClearStackToTableBodyContext(); - tb.Pop(); - tb.Transition(InTable); - } - } - else if (name.Equals("table")) - { - return ExitTableBody(t, tb); - } - else if (StringUtil.In(name, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) - { - tb.Error(this); - return false; - } - else - { - return AnythingElse(t, tb); - } - break; - default: - return AnythingElse(t, tb); - } - return true; - } - - private bool ExitTableBody(Token t, TreeBuilder tb) - { - if (!(tb.InTableScope("tbody") || tb.InTableScope("thead") || tb.InScope("tfoot"))) - { - // frag case - tb.Error(this); - return false; - } - tb.ClearStackToTableBodyContext(); - tb.Process(new Token.EndTag(tb.CurrentElement.NodeName)); // tbody, tfoot, thead - return tb.Process(t); - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - return tb.Process(t, InTable); - } - }; - protected class InRowState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsStartTag()) - { - Token.StartTag startTag = t.AsStartTag(); - String name = startTag.Name(); - - if (StringUtil.In(name, "th", "td")) - { - tb.ClearStackToTableRowContext(); - tb.Insert(startTag); - tb.Transition(InCell); - tb.InsertMarkerToFormattingElements(); - } - else if (StringUtil.In(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) - { - return HandleMissingTr(t, tb); - } - else - { - return AnythingElse(t, tb); - } - } - else if (t.IsEndTag()) - { - Token.EndTag endTag = t.AsEndTag(); - string name = endTag.Name(); - - if (name.Equals("tr")) - { - if (!tb.InTableScope(name)) - { - tb.Error(this); // frag - return false; - } - tb.ClearStackToTableRowContext(); - tb.Pop(); // tr - tb.Transition(InTableBody); - } - else if (name.Equals("table")) - { - return HandleMissingTr(t, tb); - } - else if (StringUtil.In(name, "tbody", "tfoot", "thead")) - { - if (!tb.InTableScope(name)) - { - tb.Error(this); - return false; - } - tb.Process(new 
Token.EndTag("tr")); - return tb.Process(t); - } - else if (StringUtil.In(name, "body", "caption", "col", "colgroup", "html", "td", "th")) - { - tb.Error(this); - return false; - } - else - { - return AnythingElse(t, tb); - } - } - else - { - return AnythingElse(t, tb); - } - return true; - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - return tb.Process(t, InTable); - } - - private bool HandleMissingTr(Token t, TreeBuilder tb) - { - bool processed = tb.Process(new Token.EndTag("tr")); - if (processed) - { - return tb.Process(t); - } - else - return false; - } - }; - protected class InCellState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsEndTag()) - { - Token.EndTag endTag = t.AsEndTag(); - String name = endTag.Name(); - - if (StringUtil.In(name, "td", "th")) - { - if (!tb.InTableScope(name)) - { - tb.Error(this); - tb.Transition(InRow); // might not be in scope if empty: and processing fake end tag - return false; - } - tb.GenerateImpliedEndTags(); - if (!tb.CurrentElement.NodeName.Equals(name)) - { - tb.Error(this); - } - tb.PopStackToClose(name); - tb.ClearFormattingElementsToLastMarker(); - tb.Transition(InRow); - } - else if (StringUtil.In(name, "body", "caption", "col", "colgroup", "html")) - { - tb.Error(this); - return false; - } - else if (StringUtil.In(name, "table", "tbody", "tfoot", "thead", "tr")) - { - if (!tb.InTableScope(name)) - { - tb.Error(this); - return false; - } - CloseCell(tb); - return tb.Process(t); - } - else - { - return AnythingElse(t, tb); - } - } - else if (t.IsStartTag() && - StringUtil.In(t.AsStartTag().Name(), - "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) - { - if (!(tb.InTableScope("td") || tb.InTableScope("th"))) - { - tb.Error(this); - return false; - } - CloseCell(tb); - return tb.Process(t); - } - else - { - return AnythingElse(t, tb); - } - return true; - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - return 
tb.Process(t, InBody); - } - - private void CloseCell(TreeBuilder tb) - { - if (tb.InTableScope("td")) - { - tb.Process(new Token.EndTag("td")); - } - else - { - tb.Process(new Token.EndTag("th")); // only here if th or td in scope - } - } - }; - protected class InSelectState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - switch (t.Type) - { - case Token.TokenType.Character: - - Token.Character c = t.AsCharacter(); - if (c.Data.ToString().Equals(_nullString)) - { - tb.Error(this); - return false; - } - else - { - tb.Insert(c); - } - break; - case Token.TokenType.Comment: - tb.Insert(t.AsComment()); - break; - case Token.TokenType.Doctype: - tb.Error(this); - return false; - case Token.TokenType.StartTag: - Token.StartTag start = t.AsStartTag(); - string name = start.Name(); - if (name.Equals("html")) - return tb.Process(start, InBody); - else if (name.Equals("option")) - { - tb.Process(new Token.EndTag("option")); - tb.Insert(start); - } - else if (name.Equals("optgroup")) - { - if (tb.CurrentElement.NodeName.Equals("option")) - { - tb.Process(new Token.EndTag("option")); - } - else if (tb.CurrentElement.NodeName.Equals("optgroup")) - { - tb.Process(new Token.EndTag("optgroup")); - } - tb.Insert(start); - } - else if (name.Equals("select")) - { - tb.Error(this); - return tb.Process(new Token.EndTag("select")); - } - else if (StringUtil.In(name, "input", "keygen", "textarea")) - { - tb.Error(this); - - if (!tb.InSelectScope("select")) - { - return false; // frag - } - - tb.Process(new Token.EndTag("select")); - return tb.Process(start); - } - else if (name.Equals("script")) - { - return tb.Process(t, InHead); - } - else - { - return AnythingElse(t, tb); - } - break; - case Token.TokenType.EndTag: - Token.EndTag end = t.AsEndTag(); - name = end.Name(); - if (name.Equals("optgroup")) - { - if (tb.CurrentElement.NodeName.Equals("option") && tb.AboveOnStack(tb.CurrentElement) != null && 
tb.AboveOnStack(tb.CurrentElement).NodeName.Equals("optgroup")) - { - tb.Process(new Token.EndTag("option")); - } - if (tb.CurrentElement.NodeName.Equals("optgroup")) - { - tb.Pop(); - } - else - { - tb.Error(this); - } - } - else if (name.Equals("option")) - { - if (tb.CurrentElement.NodeName.Equals("option")) - { - tb.Pop(); - } - else - { - tb.Error(this); - } - } - else if (name.Equals("select")) - { - if (!tb.InSelectScope(name)) - { - tb.Error(this); - return false; - } - else - { - tb.PopStackToClose(name); - tb.ResetInsertionMode(); - } - } - else - { - return AnythingElse(t, tb); - } - break; - case Token.TokenType.EOF: - if (!tb.CurrentElement.NodeName.Equals("html")) - { - tb.Error(this); - } - break; - default: - return AnythingElse(t, tb); - } - return true; - } - - private bool AnythingElse(Token t, TreeBuilder tb) - { - tb.Error(this); - return false; - } - }; - protected class InSelectInTableState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsStartTag() && StringUtil.In(t.AsStartTag().Name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) - { - tb.Error(this); - tb.Process(new Token.EndTag("select")); - return tb.Process(t); - } - else if (t.IsEndTag() && StringUtil.In(t.AsEndTag().Name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) - { - tb.Error(this); - if (tb.InTableScope(t.AsEndTag().Name())) - { - tb.Process(new Token.EndTag("select")); - return (tb.Process(t)); - } - else - { - return false; - } - } - else - { - return tb.Process(t, InSelect); - } - } - }; - protected class AfterBodyState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (IsWhitespace(t)) - { - return tb.Process(t, InBody); - } - else if (t.IsComment()) - { - tb.Insert(t.AsComment()); // into html node - } - else if (t.IsDoctype()) - { - tb.Error(this); - return false; - } - else if (t.IsStartTag() && t.AsStartTag().Name().Equals("html")) - { - return 
tb.Process(t, InBody); - } - else if (t.IsEndTag() && t.AsEndTag().Name().Equals("html")) - { - if (tb.IsFragmentParsing()) - { - tb.Error(this); - return false; - } - else - { - tb.Transition(AfterAfterBody); - } - } - else if (t.IsEOF()) - { - // chillax! we're done - } - else - { - tb.Error(this); - tb.Transition(InBody); - return tb.Process(t); - } - return true; - } - }; - protected class InFramesetState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (IsWhitespace(t)) - { - tb.Insert(t.AsCharacter()); - } - else if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (t.IsDoctype()) - { - tb.Error(this); - return false; - } - else if (t.IsStartTag()) - { - Token.StartTag start = t.AsStartTag(); - string name = start.Name(); - if (name.Equals("html")) - { - return tb.Process(start, InBody); - } - else if (name.Equals("frameset")) - { - tb.Insert(start); - } - else if (name.Equals("frame")) - { - tb.InsertEmpty(start); - } - else if (name.Equals("noframes")) - { - return tb.Process(start, InHead); - } - else - { - tb.Error(this); - return false; - } - } - else if (t.IsEndTag() && t.AsEndTag().Name().Equals("frameset")) - { - if (tb.CurrentElement.NodeName.Equals("html")) - { // frag - tb.Error(this); - return false; - } - else - { - tb.Pop(); - if (!tb.IsFragmentParsing() && !tb.CurrentElement.NodeName.Equals("frameset")) - { - tb.Transition(AfterFrameset); - } - } - } - else if (t.IsEOF()) - { - if (!tb.CurrentElement.NodeName.Equals("html")) - { - tb.Error(this); - return true; - } - } - else - { - tb.Error(this); - return false; - } - return true; - } - }; - protected class AfterFramesetState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (IsWhitespace(t)) - { - tb.Insert(t.AsCharacter()); - } - else if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (t.IsDoctype()) - { - tb.Error(this); - return false; - } - else if (t.IsStartTag() && 
t.AsStartTag().Name().Equals("html")) - { - return tb.Process(t, InBody); - } - else if (t.IsEndTag() && t.AsEndTag().Name().Equals("html")) - { - tb.Transition(AfterAfterFrameset); - } - else if (t.IsStartTag() && t.AsStartTag().Name().Equals("noframes")) - { - return tb.Process(t, InHead); - } - else if (t.IsEOF()) - { - // cool your heels, we're complete - } - else - { - tb.Error(this); - return false; - } - return true; - } - }; - protected class AfterAfterBodyState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (t.IsDoctype() || IsWhitespace(t) || (t.IsStartTag() && t.AsStartTag().Name().Equals("html"))) - { - return tb.Process(t, InBody); - } - else if (t.IsEOF()) - { - // nice work chuck - } - else - { - tb.Error(this); - tb.Transition(InBody); - return tb.Process(t); - } - return true; - } - }; - protected class AfterAfterFramesetState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - if (t.IsComment()) - { - tb.Insert(t.AsComment()); - } - else if (t.IsDoctype() || IsWhitespace(t) || (t.IsStartTag() && t.AsStartTag().Name().Equals("html"))) - { - return tb.Process(t, InBody); - } - else if (t.IsEOF()) - { - // nice work chuck - } - else if (t.IsStartTag() && t.AsStartTag().Name().Equals("nofrmes")) - { - return tb.Process(t, InHead); - } - else - { - tb.Error(this); - tb.Transition(InBody); - return tb.Process(t); - } - return true; - } - }; - protected class ForeignContentState : TreeBuilderState - { - public override bool Process(Token t, TreeBuilder tb) - { - return true; - // todo: implement. Also; how do we get here? 
- } - }; - - #endregion - - public static readonly TreeBuilderState Initial = new InitialState(); - public static readonly TreeBuilderState BeforeHtml = new BeforeHtmlState(); - public static readonly TreeBuilderState BeforeHead = new BeforeHeadState(); - public static readonly TreeBuilderState InHead = new InHeadState(); - public static readonly TreeBuilderState InHeadNoscript = new InHeadNoscriptState(); - public static readonly TreeBuilderState AfterHead = new AfterHeadState(); - public static readonly TreeBuilderState InBody = new InBodyState(); - public static readonly TreeBuilderState Text = new TextState(); - public static readonly TreeBuilderState InTable = new InTableState(); - public static readonly TreeBuilderState InTableText = new InTableTextState(); - public static readonly TreeBuilderState InCaption = new InCaptionState(); - public static readonly TreeBuilderState InColumnGroup = new InColumnGroupState(); - public static readonly TreeBuilderState InTableBody = new InTableBodyState(); - public static readonly TreeBuilderState InRow = new InRowState(); - public static readonly TreeBuilderState InCell = new InCellState(); - public static readonly TreeBuilderState InSelect = new InSelectState(); - public static readonly TreeBuilderState InSelectInTable = new InSelectInTableState(); - public static readonly TreeBuilderState AfterBody = new AfterBodyState(); - public static readonly TreeBuilderState InFrameset = new InFramesetState(); - public static readonly TreeBuilderState AfterFrameset = new AfterFramesetState(); - public static readonly TreeBuilderState AfterAfterBody = new AfterAfterBodyState(); - public static readonly TreeBuilderState AfterAfterFrameset = new AfterAfterFramesetState(); - public static readonly TreeBuilderState ForeignContent = new ForeignContentState(); - - protected static string _nullString = "\u0000"; - - public abstract bool Process(Token t, TreeBuilder tb); - - protected bool IsWhitespace(Token t) - { - if (t.IsCharacter()) - 
{ - string data = t.AsCharacter().Data.ToString(); - // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " - for (int i = 0; i < data.Length; i++) - { - char c = data[i]; - if (!char.IsWhiteSpace(c)) - { - return false; - } - } - return true; - } - return false; - } - - protected void HandleRcData(Token.StartTag startTag, TreeBuilder tb) - { - tb.Insert(startTag); - tb.Tokeniser.Transition(TokeniserState.RcData); - tb.MarkInsertionMode(); - tb.Transition(Text); - } - - protected void HandleRawText(Token.StartTag startTag, TreeBuilder tb) - { - tb.Insert(startTag); - tb.Tokeniser.Transition(TokeniserState.RawText); - tb.MarkInsertionMode(); - tb.Transition(Text); - } - } -} diff --git a/NSoup/Parse/XmlTreeBuilder.cs b/NSoup/Parse/XmlTreeBuilder.cs index 8324d4d..7642de5 100644 --- a/NSoup/Parse/XmlTreeBuilder.cs +++ b/NSoup/Parse/XmlTreeBuilder.cs @@ -1,8 +1,6 @@ using NSoup.Nodes; using System; using System.Collections.Generic; -using System.Linq; -using System.Text; namespace NSoup.Parse { diff --git a/NSoup/Properties/AssemblyInfo.cs b/NSoup/Properties/AssemblyInfo.cs deleted file mode 100644 index b96d02a..0000000 --- a/NSoup/Properties/AssemblyInfo.cs +++ /dev/null @@ -1,41 +0,0 @@ -using System.Reflection; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - -// General Information about an assembly is controlled through the following -// set of attributes. Change these attribute values to modify the information -// associated with an assembly. -[assembly: AssemblyTitle("NSoup")] -[assembly: AssemblyDescription("")] -[assembly: AssemblyConfiguration("")] -[assembly: AssemblyCompany("")] -[assembly: AssemblyProduct("NSoup")] -[assembly: AssemblyCopyright("Copyright © 2012 Amir Grozki")] -[assembly: AssemblyTrademark("")] -[assembly: AssemblyCulture("")] - -// Setting ComVisible to false makes the types in this assembly not visible -// to COM components. 
If you need to access a type in this assembly from -// COM, set the ComVisible attribute to true on that type. -[assembly: ComVisible(false)] - -// The following GUID is for the ID of the typelib if this project is exposed to COM -[assembly: Guid("ea7c4382-d4cf-470b-9525-72d42850c656")] - -// Added by popular demand. -[assembly: System.Security.AllowPartiallyTrustedCallers] - -[assembly: InternalsVisibleTo("Test, PublicKey=0024000004800000940000000602000000240000525341310004000001000100ff2769904d9601c999569e2ebea98b0822f0c58cb1a59d26ac1b3f0a9361cc58217d8c119b0cd0b8b16cb74e470f1b3c50334cefee49b3a0f9ca5830f418584c7ae8f9860a5fe91cdc2a51e8db16d8d1575c053c2e24bca46f644dcf12b3633829077a0ff5e68e4ca0491b9fbba3b19a06eb2887251bcb186486e2d2ddcbb3b5")] - -// Version information for an assembly consists of the following four values: -// -// Major Version -// Minor Version -// Build Number -// Revision -// -// You can specify all the values or you can default the Build and Revision Numbers -// by using the '*' as shown below: -// [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("0.8.0.0")] -[assembly: AssemblyFileVersion("0.8.0.0")] diff --git a/NSoup/Safety/Cleaner.cs b/NSoup/Safety/Cleaner.cs index dd5b4ef..8b019fe 100644 --- a/NSoup/Safety/Cleaner.cs +++ b/NSoup/Safety/Cleaner.cs @@ -1,9 +1,7 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using NSoup.Nodes; +using NSoup.Nodes; using NSoup.Parse; +using System; +using System.Collections.Generic; namespace NSoup.Safety { diff --git a/NSoup/Safety/Whitelist.cs b/NSoup/Safety/Whitelist.cs index edc210b..42370d2 100644 --- a/NSoup/Safety/Whitelist.cs +++ b/NSoup/Safety/Whitelist.cs @@ -1,8 +1,6 @@ -using System; +using NSoup.Nodes; +using System; using System.Collections.Generic; -using System.Linq; -using System.Text; -using NSoup.Nodes; namespace NSoup.Safety { diff --git a/NSoup/Select/Collector.cs b/NSoup/Select/Collector.cs index 25cc295..f324957 100644 --- 
a/NSoup/Select/Collector.cs +++ b/NSoup/Select/Collector.cs @@ -1,8 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using NSoup.Nodes; +using NSoup.Nodes; namespace NSoup.Select { diff --git a/NSoup/Select/CombiningEvaluator.cs b/NSoup/Select/CombiningEvaluator.cs index 95d86c6..76c9e8a 100644 --- a/NSoup/Select/CombiningEvaluator.cs +++ b/NSoup/Select/CombiningEvaluator.cs @@ -1,8 +1,6 @@ -using System; +using NSoup.Nodes; using System.Collections.Generic; using System.Linq; -using System.Text; -using NSoup.Nodes; namespace NSoup.Select { @@ -89,7 +87,7 @@ public Or(ICollection evaluators) public Or() : base() - {} + { } public void Add(Evaluator e) { diff --git a/NSoup/Select/Elements.cs b/NSoup/Select/Elements.cs index c9af7c9..2e34a89 100644 --- a/NSoup/Select/Elements.cs +++ b/NSoup/Select/Elements.cs @@ -1,8 +1,8 @@ -using System; +using NSoup.Nodes; +using System; using System.Collections.Generic; using System.Linq; using System.Text; -using NSoup.Nodes; namespace NSoup.Select { @@ -327,7 +327,8 @@ public override string ToString() /// the new tag name /// this, for chaining /// - public Elements TagName(string tagName) { + public Elements TagName(string tagName) + { foreach (Element element in _contents) { element.TagName(tagName); diff --git a/NSoup/Select/Evaluator.cs b/NSoup/Select/Evaluator.cs index 9b18c51..cac91cf 100644 --- a/NSoup/Select/Evaluator.cs +++ b/NSoup/Select/Evaluator.cs @@ -1,9 +1,6 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; +using NSoup.Nodes; +using System; using System.Text.RegularExpressions; -using NSoup.Nodes; namespace NSoup.Select { diff --git a/NSoup/Select/NodeTraversor.cs b/NSoup/Select/NodeTraversor.cs index 1f5a168..b3c5c4e 100644 --- a/NSoup/Select/NodeTraversor.cs +++ b/NSoup/Select/NodeTraversor.cs @@ -1,8 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using NSoup.Nodes; 
+using NSoup.Nodes; namespace NSoup.Select { diff --git a/NSoup/Select/NodeVisitor.cs b/NSoup/Select/NodeVisitor.cs index 7401b7c..b4186ff 100644 --- a/NSoup/Select/NodeVisitor.cs +++ b/NSoup/Select/NodeVisitor.cs @@ -1,8 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using NSoup.Nodes; +using NSoup.Nodes; namespace NSoup.Select { diff --git a/NSoup/Select/QueryParser.cs b/NSoup/Select/QueryParser.cs index 67afa96..213c037 100644 --- a/NSoup/Select/QueryParser.cs +++ b/NSoup/Select/QueryParser.cs @@ -1,8 +1,7 @@ -using System; +using NSoup.Parse; +using System; using System.Collections.Generic; -using System.Linq; using System.Text; -using NSoup.Parse; using System.Text.RegularExpressions; namespace NSoup.Select @@ -95,17 +94,19 @@ private void Combinator(char combinator) Evaluator newEval = Parse(subQuery); // the evaluator to add into target evaluator bool replaceRightMost = false; - if (_evals.Count == 1) { - rootEval = currentEval = _evals[0]; - // make sure OR (,) has precedence: - if (rootEval is CombiningEvaluator.Or && combinator != ',') - { - currentEval = ((CombiningEvaluator.Or)currentEval).RightMostEvaluator(); - replaceRightMost = true; - } - } - else { - rootEval = currentEval = new CombiningEvaluator.And(_evals); + if (_evals.Count == 1) + { + rootEval = currentEval = _evals[0]; + // make sure OR (,) has precedence: + if (rootEval is CombiningEvaluator.Or && combinator != ',') + { + currentEval = ((CombiningEvaluator.Or)currentEval).RightMostEvaluator(); + replaceRightMost = true; + } + } + else + { + rootEval = currentEval = new CombiningEvaluator.And(_evals); } _evals.Clear(); diff --git a/NSoup/Select/Selector.cs b/NSoup/Select/Selector.cs index 23dd13d..97f15ce 100644 --- a/NSoup/Select/Selector.cs +++ b/NSoup/Select/Selector.cs @@ -1,9 +1,6 @@ -using System; +using NSoup.Nodes; +using System; using System.Collections.Generic; -using System.Linq; -using System.Text; -using NSoup.Nodes; -using 
NSoup.Parse; namespace NSoup.Select { diff --git a/NSoup/Select/StructuralEvaluator.cs b/NSoup/Select/StructuralEvaluator.cs index 058d5ac..b2253e5 100644 --- a/NSoup/Select/StructuralEvaluator.cs +++ b/NSoup/Select/StructuralEvaluator.cs @@ -1,8 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using NSoup.Nodes; +using NSoup.Nodes; namespace NSoup.Select { diff --git a/NSoup/UnsupportedMimeTypeException.cs b/NSoup/UnsupportedMimeTypeException.cs index 1dc1e41..6b48f47 100644 --- a/NSoup/UnsupportedMimeTypeException.cs +++ b/NSoup/UnsupportedMimeTypeException.cs @@ -1,8 +1,4 @@ -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; +using System.IO; namespace NSoup { @@ -36,4 +32,4 @@ public override string ToString() return base.ToString() + ". Mimetype=" + MimeType + ", URL=" + Url; } } -} +} diff --git a/Test/Test.csproj b/Test/Test.csproj index 7a2287b..3b6a162 100644 --- a/Test/Test.csproj +++ b/Test/Test.csproj @@ -8,7 +8,7 @@ Properties Test Test - v4.6 + v4.7 512 {3AC096D0-A1C2-E12C-1390-A8335801FDAB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} 10.0 From 7b0d72777dc870128ce231ab53858dcc9f52f07c Mon Sep 17 00:00:00 2001 From: Milen Date: Tue, 9 Jan 2018 17:12:22 +0800 Subject: [PATCH 2/7] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index db40de6..401461e 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,9 @@ NSoup is a .NET port of the jsoup (https://github.com/jhy/jsoup) HTML parser and jsoup originally written by [Jonathan Hedley](https://github.com/jhy). Ported to .NET by Amir Grozki. -**NOTE**: (2013-07-10) In the last few months I've been struggling with a few tests crashing for some reason I cannot isolate. I've pushed the latest version of the source code, and if anyone can help solve those issues it would greatly help this project. 
+**NOTE**: +(2018-01-09) .NET Standard 2.0 support added by [Milen](https://github.com/milenstack) +(2013-07-10) In the last few months I've been struggling with a few tests crashing for some reason I cannot isolate. I've pushed the latest version of the source code, and if anyone can help solve those issues it would greatly help this project. ## Features From bc8eb736743a7dd0a8de44b76a2b58e95f26a56e Mon Sep 17 00:00:00 2001 From: Milen Date: Wed, 10 Jan 2018 11:46:48 +0800 Subject: [PATCH 3/7] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 401461e..2714faa 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Ported to .NET by Amir Grozki. **NOTE**: (2018-01-09) .NET Standard 2.0 support added by [Milen](https://github.com/milenstack) + (2013-07-10) In the last few months I've been struggling with a few tests crashing for some reason I cannot isolate. I've pushed the latest version of the source code, and if anyone can help solve those issues it would greatly help this project. ## Features From 4ef493136200c1a38272417360a43b77461d9ae9 Mon Sep 17 00:00:00 2001 From: Milen Date: Wed, 10 Jan 2018 11:47:09 +0800 Subject: [PATCH 4/7] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2714faa..47d23bc 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ jsoup originally written by [Jonathan Hedley](https://github.com/jhy). Ported to .NET by Amir Grozki. **NOTE**: + (2018-01-09) .NET Standard 2.0 support added by [Milen](https://github.com/milenstack) (2013-07-10) In the last few months I've been struggling with a few tests crashing for some reason I cannot isolate. I've pushed the latest version of the source code, and if anyone can help solve those issues it would greatly help this project.
From 73534f677e35cb01989e915db706d5af096f0f04 Mon Sep 17 00:00:00 2001 From: Milen Date: Wed, 10 Jan 2018 14:33:34 +0800 Subject: [PATCH 5/7] --- .gitignore | 1 + NSoup/NSoup.csproj | 19 +++++++++++++++++-- NSoup/NSoup.snk | Bin 0 -> 596 bytes 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 NSoup/NSoup.snk diff --git a/.gitignore b/.gitignore index fc0a7bc..e60c60e 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,4 @@ UpgradeLog*.XML .sass-cache .sass-cache/* /.vs/NSoup/v15/Server/sqlite3 +/.vs diff --git a/NSoup/NSoup.csproj b/NSoup/NSoup.csproj index 4474604..1afe540 100644 --- a/NSoup/NSoup.csproj +++ b/NSoup/NSoup.csproj @@ -2,9 +2,24 @@ netstandard2.0 - false - 2.0.0 + true + 2.0.1 true + NSoup.snk + NSoup.Standard + + + milenstack + 2.0.0.1 + 2.0.0.1 + NSoup.Standard + NSoup is a .NET port of the jsoup (http://jsoup.org) HTML parser and sanitizer originally written in Java. + https://github.com/milenstack/NSoup + https://github.com/GeReV/NSoup/blob/master/LICENSE + + forked from https://github.com/GeReV/NSoup +update .NET Standard 2.0 +source: https://github.com/milenstack/NSoup diff --git a/NSoup/NSoup.snk b/NSoup/NSoup.snk new file mode 100644 index 0000000000000000000000000000000000000000..9a09a7830c5bd4bb8a57177e2b68ad235f698cf7 GIT binary patch literal 596 zcmV-a0;~N80ssI2Bme+XQ$aES1ONa50098CUc4|T2}tmm`D#Ti9h<7zAVV7KOBwq3 z#9Of&QU%VA&}C3Xm?Cx+@sA=5ePF&misyPcZpZJ;N!f-Z%SvSlGg>qM=CKo7O}3Xa zEFuO~*{iZ=m~x+kVFIOuiPXq}Lh6^8=7Fo5>b}ujvt3Q{3mU?iBNH5F1xgpBx{}cw zq3G`@-z}qf}|(M)gh8+)R&^9jsvRXJLS<>NXR7ovl4Z;`|Xj@W6dKajnQ z%wFUDB6{qGLA^m+{pi*1wUvBgaBEmRAIz&=zl zd@bHcpQtOfE13#41#XDnAE1+>1ZjRWxON!=IEBw~E8T(fOwh=Ojm|Xn-Nd?=W#Mz|VU+oJLtI`n^J! 
iTn^f)*Rx8XZ4<#%l$}+%quOvfH2kIkk^gif7hW8Yp&pe0 literal 0 HcmV?d00001 From 3bacd10e2e7f182154bb1da833fca5c560fc1f67 Mon Sep 17 00:00:00 2001 From: Milen Date: Wed, 10 Jan 2018 14:41:09 +0800 Subject: [PATCH 6/7] Update README.md --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index 47d23bc..a1e51aa 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,3 @@ -**NSoup is currently unmaintained.** - -At this time, I am not actively working on this library. However, I will happily accept any help and pull requests, and perhaps return to working on it, should it gain any more traction. - -The source code has been migrated from CodePlex in the hopes it will get picked up by the GitHub community. It is by now fairly outdated and perhaps should be ported from latest *jsoup* scratch. - # NSoup NSoup is a .NET port of the jsoup (https://github.com/jhy/jsoup) HTML parser and sanitizer originally written in Java. From 93b2d1cfcdf8da3d991c6f57f752ffa57e4827c0 Mon Sep 17 00:00:00 2001 From: Milen Date: Tue, 16 Jan 2018 12:51:12 +0800 Subject: [PATCH 7/7] Update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a1e51aa..570e1aa 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ +# Nuget + +> Install-Package [NSoup.Standard](https://www.nuget.org/packages/NSoup.Standard/) + # NSoup -NSoup is a .NET port of the jsoup (https://github.com/jhy/jsoup) HTML parser and sanitizer originally written in Java. +NSoup is a .NET port of the [jsoup](https://github.com/jhy/jsoup) HTML parser and sanitizer originally written in Java. jsoup originally written by [Jonathan Hedley](https://github.com/jhy). Ported to .NET by Amir Grozki.