1: using System.Text.RegularExpressions;
2: using System.Web;
3:
4: namespace Ia.Cl.Model
5: {
6: ////////////////////////////////////////////////////////////////////////////
7:
8: /// <summary publish="true">
9: /// Handle HTML encoding, decoding functions.
10: /// </summary>
11: /// <remarks>
/// Copyright © 2001-2018 Jasem Y. Al-Shamlan (info@ia.com.kw), Integrated Applications - Kuwait. All Rights Reserved.
13: ///
14: /// This library is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
15: /// the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
16: ///
17: /// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
18: /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
19: ///
20: /// You should have received a copy of the GNU General Public License along with this library. If not, see http://www.gnu.org/licenses.
21: ///
22: /// Copyright notice: This notice may not be removed or altered from any source distribution.
23: /// </remarks>
public class Html
{
    // Placeholder tokens written in place of characters that the original
    // "database requirement" notes say must not be stored verbatim.
    private const string ApostropheToken = "_#039_";
    private const string BackslashToken = "_#092_";
    private const string QuestionMarkToken = "_#063_";

    // Placeholder tokens used by XmlEncode/XmlDecode ("XML requirement").
    private const string AmpersandToken = "_amp_";
    private const string GreaterThanToken = "_gt_";
    private const string LessThanToken = "_lt_";

    // Whitespace between two adjacent tags.
    private static readonly Regex regexBetweenTags = new Regex(@">\s+<", RegexOptions.Compiled);

    // A newline together with the whitespace (indentation) that follows it.
    private static readonly Regex regexLineBreaks = new Regex(@"\n\s+", RegexOptions.Compiled);

    // Ordered (pattern, replacement) pairs applied by StripHtml after all tags are removed.
    // Each pattern accepts the named entity and, where one was handled before, the literal
    // character. &lt; / &gt; are decoded only at this stage so that the angle brackets they
    // produce are not mistaken for markup by the tag-stripping step.
    private static readonly string[,] entityReplacements =
    {
        { @"&nbsp;|\u00A0", " " },
        { @"&bull;|\u2022", " * " },
        { @"&lsaquo;|\u2039", "<" },
        { @"&rsaquo;|\u203A", ">" },
        { @"&trade;|\u2122", "(tm)" },
        { @"&frasl;|\u2044", "/" },
        { @"&lt;", "<" },
        { @"&gt;", ">" },
        { @"&copy;|\u00A9", "(c)" },
        { @"&reg;|\u00AE", "(r)" },
    };

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Creates an instance. All functionality of this class is exposed through static members.
    /// </summary>
    public Html()
    {
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// HTML-encodes a string.
    /// </summary>
    /// <param name="s">Text to encode; may be null.</param>
    /// <returns>The encoded text, or null when <paramref name="s"/> is null.</returns>
    /// <remarks>
    /// When the WFA symbol is defined a hand-written encoder is used so that System.Web is not required:
    /// it escapes the angle brackets, the double quote and the ampersand, and writes every character
    /// above code 159 as a decimal numeric entity. Otherwise the call is forwarded to HttpUtility.HtmlEncode.
    /// See http://www.west-wind.com/weblog/posts/2009/Feb/05/Html-and-Uri-String-Encoding-without-SystemWeb
    /// </remarks>
    public static string HtmlEncode(string s)
    {
#if WFA
        if (s == null)
        {
            return null;
        }

        System.Text.StringBuilder sb = new System.Text.StringBuilder(s.Length);

        foreach (char c in s)
        {
            switch (c)
            {
                case '<': sb.Append("&lt;"); break;
                case '>': sb.Append("&gt;"); break;
                case '"': sb.Append("&quot;"); break;
                case '&': sb.Append("&amp;"); break;
                default:
                    if (c > 159)
                    {
                        // decimal numeric entity
                        sb.Append("&#");
                        sb.Append(((int)c).ToString(System.Globalization.CultureInfo.InvariantCulture));
                        sb.Append(";");
                    }
                    else
                    {
                        sb.Append(c);
                    }
                    break;
            }
        }

        return sb.ToString();
#else
        return HttpUtility.HtmlEncode(s);
#endif
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Decodes HTML-encoded text.
    /// </summary>
    /// <param name="s">Text to decode; may be null.</param>
    /// <returns>The decoded text, or null when <paramref name="s"/> is null.</returns>
    /// <remarks>
    /// When the WFA symbol is defined only the four named entities written by <see cref="HtmlEncode"/>
    /// are decoded; numeric entities are left untouched in that build. Otherwise the call is forwarded
    /// to HttpUtility.HtmlDecode.
    /// </remarks>
    public static string HtmlDecode(string s)
    {
#if WFA
        if (s == null)
        {
            return null;
        }

        s = s.Replace("&lt;", "<");
        s = s.Replace("&gt;", ">");
        s = s.Replace("&quot;", "\"");

        // The ampersand goes last so that "&amp;lt;" decodes to the text "&lt;" and not to "<".
        s = s.Replace("&amp;", "&");

        return s;
#else
        return HttpUtility.HtmlDecode(s);
#endif
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// HTML-encodes a string and then replaces apostrophes and question marks with placeholder tokens
    /// (noted in the original code as a database requirement).
    /// </summary>
    /// <param name="s">Text to encode; may be null.</param>
    /// <returns>The encoded text, or null when <paramref name="s"/> is null.</returns>
    public static string Encode(string s)
    {
        if (s == null)
        {
            return null;
        }

        return HtmlEncode(s)
            .Replace("'", ApostropheToken)
            .Replace("?", QuestionMarkToken);
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Reverses <see cref="Encode"/>: restores the placeholder tokens and then HTML-decodes the text.
    /// </summary>
    /// <param name="s">Text to decode; may be null.</param>
    /// <returns>The decoded text, or null when <paramref name="s"/> is null.</returns>
    public static string Decode(string s)
    {
        if (s == null)
        {
            return null;
        }

        s = s.Replace(QuestionMarkToken, "?");
        s = s.Replace(ApostropheToken, "'");

        return HtmlDecode(s);
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Decodes like <see cref="Decode"/> and then replaces every line break with a single space.
    /// </summary>
    /// <param name="s">Text to decode; may be null.</param>
    /// <returns>The decoded single-line text, or null when <paramref name="s"/> is null.</returns>
    public static string DecodeRemoveNLLF(string s)
    {
        if (s == null)
        {
            return null;
        }

        s = Decode(s);

        // CR LF pairs are handled first: replacing "\n\r" first would split the sequence
        // "\r\n\r\n" down the middle and leave three spaces for two line breaks.
        s = s.Replace("\r\n", " ");
        s = s.Replace("\n\r", " ");
        s = s.Replace("\n", " ");
        s = s.Replace("\r", " ");

        return s;
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// HTML-encodes a string and then replaces the apostrophe, backslash, question mark, ampersand
    /// and angle brackets with placeholder tokens, so the result contains none of those characters.
    /// </summary>
    /// <param name="s">Text to encode; may be null.</param>
    /// <returns>The encoded text, or null when <paramref name="s"/> is null.</returns>
    public static string XmlEncode(string s)
    {
        if (s == null)
        {
            return null;
        }

        s = HtmlEncode(s);

        s = s.Replace("'", ApostropheToken);
        s = s.Replace(@"\", BackslashToken);
        s = s.Replace("?", QuestionMarkToken);

        // XML requirement: the ampersands introduced by HTML encoding are tokenised as well.
        s = s.Replace("&", AmpersandToken);
        s = s.Replace(">", GreaterThanToken);
        s = s.Replace("<", LessThanToken);

        return s;
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Reverses <see cref="XmlEncode"/>: restores all placeholder tokens and then HTML-decodes the text.
    /// </summary>
    /// <param name="s">Text to decode; may be null.</param>
    /// <returns>The decoded text, or null when <paramref name="s"/> is null.</returns>
    public static string XmlDecode(string s)
    {
        if (s == null)
        {
            return null;
        }

        // XML requirement
        s = s.Replace(GreaterThanToken, ">");
        s = s.Replace(LessThanToken, "<");
        s = s.Replace(AmpersandToken, "&");

        s = s.Replace(ApostropheToken, "'");
        s = s.Replace(BackslashToken, @"\");
        s = s.Replace(QuestionMarkToken, "?");

        return HtmlDecode(s);
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// HTML-encodes a piece of markup so that it is displayed as regular text.
    /// </summary>
    /// <param name="code">Markup to display; may be null.</param>
    /// <returns>The encoded markup, or null when <paramref name="code"/> is null.</returns>
    public static string Code(string code)
    {
        return HtmlEncode(code);
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Converts HTML into plain text.
    /// </summary>
    /// <param name="source">HTML to strip; may be null.</param>
    /// <returns>
    /// The text with head, script and style elements removed, table cells turned into tabs, line and
    /// paragraph tags turned into carriage returns, all other tags dropped and a fixed set of entities
    /// translated. Any remaining entity-like sequence is deleted. Returns null when
    /// <paramref name="source"/> is null.
    /// </returns>
    public static string StripHtml(string source)
    {
        if (source == null)
        {
            return null;
        }

        // Source formatting carries no meaning in HTML: line breaks become spaces
        // (as a browser would render them), tabs are dropped, space runs are collapsed.
        string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", string.Empty);
        result = Regex.Replace(result, @"( )+", " ");

        // Elements whose content must never show up in the text.
        result = RemoveElement(result, "head");
        result = RemoveElement(result, "script");
        result = RemoveElement(result, "style");

        // Table cells become tabs.
        result = ReplaceIgnoreCase(result, @"<( )*td([^>])*>", "\t");

        // <br> (also in its self-closing form) and <li> become a line break.
        result = ReplaceIgnoreCase(result, @"<( )*br( )*(/)?( )*>", "\r");
        result = ReplaceIgnoreCase(result, @"<( )*li( )*>", "\r");

        // <div>, <tr> and <p> become a paragraph break (two line breaks).
        result = ReplaceIgnoreCase(result, @"<( )*div([^>])*>", "\r\r");
        result = ReplaceIgnoreCase(result, @"<( )*tr([^>])*>", "\r\r");
        result = ReplaceIgnoreCase(result, @"<( )*p([^>])*>", "\r\r");

        // Everything else enclosed in < > (anchors, images, comments, closing tags ...) is dropped.
        result = Regex.Replace(result, @"<[^>]*>", string.Empty);

        // Known entities and special characters.
        for (int i = 0; i < entityReplacements.GetLength(0); i++)
        {
            result = ReplaceIgnoreCase(result, entityReplacements[i, 0], entityReplacements[i, 1]);
        }

        // Any other entity-like sequence is removed.
        result = Regex.Replace(result, @"&(.{2,6});", string.Empty);

        // Make line breaking consistent.
        result = result.Replace("\n", "\r");

        // Drop spaces caught between breaks and tabs, and tabs caught between breaks.
        result = Regex.Replace(result, @"\r +\r", "\r\r");
        result = Regex.Replace(result, @"\t +\t", "\t\t");
        result = Regex.Replace(result, @"\t +\r", "\t\r");
        result = Regex.Replace(result, @"\r +\t", "\r\t");
        result = Regex.Replace(result, @"\r\t+\r", "\r\r");

        // Several tabs following a line break become a single tab.
        result = Regex.Replace(result, @"\r\t+", "\r\t");

        // At most two consecutive line breaks and four consecutive tabs.
        result = Regex.Replace(result, @"\r{3,}", "\r\r");
        result = Regex.Replace(result, @"\t{5,}", "\t\t\t\t");

        return result;
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Cleans regular text: decodes it with <see cref="Decode"/>, puts a space after every period,
    /// collapses runs of spaces, removes carriage returns and collapses runs of newlines.
    /// </summary>
    /// <param name="source">Encoded text; may be null.</param>
    /// <returns>The cleaned text, or null when <paramref name="source"/> is null.</returns>
    public static string TextToHtml(string source)
    {
        return NormalizeText(source);
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Cleans regular text like <see cref="TextToHtml"/> and wraps every line in a paragraph element.
    /// </summary>
    /// <param name="source">Encoded text; may be null.</param>
    /// <returns>The paragraph markup, or null when <paramref name="source"/> is null.</returns>
    public static string TextToHtml2(string source)
    {
        string s = NormalizeText(source);

        if (s == null)
        {
            return null;
        }

        return "<p>" + s.Replace("\n", "</p>\n<p>") + "</p>";
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Cleans regular text like <see cref="TextToHtml"/> and flattens ordered and unordered lists:
    /// list openings become two line-break tags, every item becomes a dash-prefixed line ending in a
    /// line-break tag, and list closings are removed.
    /// </summary>
    /// <param name="source">Encoded text; may be null.</param>
    /// <returns>The flattened markup, or null when <paramref name="source"/> is null.</returns>
    public static string TextToHtmlAndOl_Ul_LiToBr(string source)
    {
        string s = NormalizeText(source);

        if (s == null)
        {
            return null;
        }

        s = s.Replace("<ol>", "<br/> <br/>");
        s = s.Replace("</ol>", "");
        s = s.Replace("<ul>", "<br/> <br/>");
        s = s.Replace("</ul>", "");
        s = s.Replace("<li>", "-");
        s = s.Replace("</li>", "<br/>");

        return s;
    }

    ////////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Removes redundant whitespace from a page: whitespace between adjacent tags is reduced to one
    /// space, newlines are removed together with the indentation that follows them, and the result is
    /// trimmed. A page containing a pre element is only trimmed so its formatting is preserved.
    /// <see href="http://madskristensen.net/post/remove-whitespace-from-your-pages"/>
    /// </summary>
    /// <param name="html">Markup to compact; may be null.</param>
    /// <returns>The compacted markup, or null when <paramref name="html"/> is null.</returns>
    public static string RemoveWhitespaceFromHtml(string html)
    {
        if (html == null)
        {
            return null;
        }

        // Matching "<pre" case-insensitively also covers <PRE> and <pre class="...">.
        bool hasPreformattedText = html.IndexOf("<pre", System.StringComparison.OrdinalIgnoreCase) >= 0;

        if (!hasPreformattedText)
        {
            html = regexBetweenTags.Replace(html, "> <");
            html = regexLineBreaks.Replace(html, string.Empty);
        }

        return html.Trim();
    }

    ////////////////////////////////////////////////////////////////////////////

    // Case-insensitive regular expression replacement.
    private static string ReplaceIgnoreCase(string input, string pattern, string replacement)
    {
        return Regex.Replace(input, pattern, replacement, RegexOptions.IgnoreCase);
    }

    // Removes every <tagName>...</tagName> element together with its content.
    private static string RemoveElement(string html, string tagName)
    {
        string open = "<" + tagName + ">";
        string close = "</" + tagName + ">";

        // Normalise the tags first: drop attributes from the opening tag and stray spaces from the closing tag.
        html = ReplaceIgnoreCase(html, @"<( )*" + tagName + @"([^>])*>", open);
        html = ReplaceIgnoreCase(html, @"(<( )*(/)( )*" + tagName + @"( )*>)", close);

        // Lazy match: with several elements of the same kind only the elements themselves are
        // removed, not the text that sits between the first opening and the last closing tag.
        return ReplaceIgnoreCase(html, "(" + open + ").*?(" + close + ")", string.Empty);
    }

    // Shared first stage of the TextToHtml family: decode, space after periods,
    // single spaces, no carriage returns, no repeated newlines.
    private static string NormalizeText(string source)
    {
        if (source == null)
        {
            return null;
        }

        string s = Decode(source);

        s = Regex.Replace(s, @"\.", ". ");
        s = Regex.Replace(s, @"[ ]+", " ");
        s = s.Replace("\r", "");

        // A regular expression is required here; String.Replace would look for a newline followed by a literal '+'.
        s = Regex.Replace(s, @"\n+", "\n");

        return s;
    }
}
478: }