< 返回新闻公共列表
云南大王-web系统安全运营之基础
发布时间:2020-04-16 00:00:00
【概述】做好一个 Web 系统的安全运维,除了常规的防注入、防入侵等,还有一项重要工作:检测并过滤敏感词、脏词。这件事做得不好,轻则导致一场投诉或纠纷,重则导致产品被勒令关闭停运。
废话少说,先看下代码,可以拿过去直接使用。
1 using Microsoft.VisualBasic;
2 using System;
3 using System.Collections.Generic;
4 using System.IO;
5 using System.Linq;
6 using System.Text;
7
namespace OpenCore.ContentSecurity
{
    /// <summary>
    /// Detects and masks forbidden keywords in text. Dictionary words are bucketed by
    /// their first character (the original author describes the approach as DFA-based);
    /// scanning looks up the bucket for each character of the input and tries the
    /// remaining characters of every word in that bucket.
    /// Reference consulted by the original author:
    /// https://blog.csdn.net/u011966339/article/details/72832197
    /// Change log:
    /// 2020-04-15: dictionaries are loaded once into static state instead of on every
    /// use; multiple dictionary files are supported; algorithm details hardened.
    /// </summary>
    public class SensitiveWordFilter
    {
        private static string[] dictionaryPathList = null;

        /// <summary>
        /// In-memory lexicon indexed by the first character of each word.
        /// It has one slot per UTF-16 code unit, i.e. char.MaxValue + 1 entries, so that
        /// U+FFFF is a valid index too. The array is never mutated after publication;
        /// reloading swaps in a freshly built array.
        /// </summary>
        private static volatile WordGroup[] MEMORYLEXICON = new WordGroup[char.MaxValue + 1];

        /// <summary>Serializes dictionary (re)loads triggered through <see cref="Init"/>.</summary>
        private static readonly object lockObj = new object();

        /// <summary>
        /// Configures the dictionary files and loads them into memory. Call once at startup;
        /// calling it again reloads the lexicon.
        /// </summary>
        /// <param name="sDictionaryFileName">Paths of the dictionary files, one word per line.</param>
        public static void Init(string[] sDictionaryFileName)
        {
            lock (lockObj)
            {
                dictionaryPathList = sDictionaryFileName;
                LoadDictionary();
            }
        }

        public SensitiveWordFilter()
        {
        }

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Forbidden words found by the most recent <see cref="getDataByFilter"/> call on this
        /// instance, exactly as they appeared in the input (including skipped separators).
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>Returns true for CJK ideographs in the range U+4E00..U+9FA5.</summary>
        private static bool isCHS(char character)
        {
            return character >= 0x4e00 && character <= 0x9fa5;
        }

        /// <summary>Returns true for the ASCII digits 0-9.</summary>
        private static bool isNum(char character)
        {
            return character >= '0' && character <= '9';
        }

        /// <summary>Returns true for the ASCII letters a-z and A-Z.</summary>
        private static bool isAlphabet(char character)
        {
            return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
        }

        /// <summary>
        /// Returns true when the character is neither a CJK ideograph, an ASCII digit nor an
        /// ASCII letter. Such characters may be inserted between the characters of a
        /// forbidden word (e.g. "a^b") and are skipped while matching.
        /// </summary>
        private static bool IsSeparator(char character)
        {
            return !isCHS(character) && !isNum(character) && !isAlphabet(character);
        }

        /// <summary>
        /// Converts full-width characters to half-width and lower-cases the result.
        /// The full-width space (12288) maps to the ASCII space (32); the other full-width
        /// forms (65281-65374) map to ASCII 33-126 by subtracting 65248.
        /// The result always has the same length as the input.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>The half-width, lower-case form of <paramref name="input"/>.</returns>
        private static string ToDBC(string input)
        {
            char[] c = input.ToCharArray();
            for (int i = 0; i < c.Length; i++)
            {
                if (c[i] == 12288)
                {
                    c[i] = (char)32;
                }
                else if (c[i] > 65280 && c[i] < 65375)
                {
                    c[i] = (char)(c[i] - 65248);
                }
            }
            // Invariant casing: the current culture must not influence matching
            // (e.g. a Turkish culture maps 'I' to a dotless 'i').
            return new string(c).ToLowerInvariant();
        }

        /// <summary>
        /// Converts text to Simplified Chinese. Conversion is best-effort: when it is
        /// not available or fails, the input is returned unchanged.
        /// </summary>
        /// <param name="sInput">Text to convert; null or empty yields an empty string.</param>
        private static string ToSimplifiedChiniese(string sInput)
        {
            if (string.IsNullOrEmpty(sInput))
            {
                return string.Empty;
            }
            try
            {
                return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
            }
            catch (Exception)
            {
                // Deliberately ignored: the caller still registers the unconverted word.
                return sInput;
            }
        }

        /// <summary>
        /// Appends a message to a per-day log file in the "SecurityLog" directory under the
        /// application base. Logging is best-effort and never throws.
        /// </summary>
        /// <param name="Msg">Message to append.</param>
        private static void SaveLog(string Msg)
        {
            try
            {
                string sPath = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "SecurityLog");
                // No-op when the directory already exists; kept inside the try so that a
                // read-only or missing base directory cannot break the caller.
                Directory.CreateDirectory(sPath);
                sPath = Path.Combine(sPath, DateTime.Now.ToString("yyyyMMdd") + ".log");
                File.AppendAllText(sPath, "[" + DateTime.Now.ToString() + "]" + Msg + "\r\n");
            }
            catch (Exception)
            {
                // A failure to write the log must not affect content filtering.
            }
        }

        /// <summary>
        /// Reads every configured dictionary file and publishes a new in-memory lexicon.
        /// If the configuration is empty or any file is missing, the problem is logged and
        /// the current lexicon is left untouched. I/O errors while reading propagate.
        /// </summary>
        private static void LoadDictionary()
        {
            if (dictionaryPathList == null || dictionaryPathList.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.LoadDictionary.字典路径配置为空");
                return;
            }
            foreach (string sFileName in dictionaryPathList)
            {
                if (!File.Exists(sFileName))
                {
                    SaveLog($"SensitiveWordFilter.LoadDictionary.路径:{sFileName}不是一个有效的文件");
                    return;
                }
            }

            // A set removes duplicates across files in O(1) per word.
            HashSet<string> uniqueWords = new HashSet<string>(StringComparer.Ordinal);
            foreach (string sDictionaryFile in dictionaryPathList)
            {
                // NOTE(review): Encoding.Default differs between runtimes; confirm it
                // matches the encoding of the dictionary files in use.
                foreach (string line in File.ReadAllLines(sDictionaryFile, Encoding.Default))
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }
                    // Normalize first (full-width spaces become ASCII spaces), then trim:
                    // surrounding whitespace would otherwise make the word unmatchable.
                    string key = ToDBC(line).Trim();
                    if (key.Length == 0)
                    {
                        continue;
                    }
                    uniqueWords.Add(key);
                    // Also register the Simplified Chinese form of Traditional entries.
                    uniqueWords.Add(ToSimplifiedChiniese(key));
                }
            }

            // Deterministic order: within a bucket a word sorts before any longer word
            // that starts with it.
            List<string> wordList = new List<string>(uniqueWords);
            wordList.Sort(StringComparer.Ordinal);

            // Build into a fresh array and publish it in one assignment so that filters
            // running concurrently never observe a half-built lexicon.
            WordGroup[] lexicon = new WordGroup[char.MaxValue + 1];
            foreach (string word in wordList)
            {
                if (word.Length == 0)
                {
                    continue;
                }
                WordGroup group = lexicon[word[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    lexicon[word[0]] = group;
                }
                // Only the part after the first character is stored; the first character
                // is implied by the bucket index.
                group.Add(word.Substring(1));
            }
            MEMORYLEXICON = lexicon;
        }

        /// <summary>
        /// Tries to match <paramref name="tail"/> (a dictionary word without its first
        /// character) against <paramref name="text"/> starting at <paramref name="position"/>.
        /// Separator characters in the text are skipped unless they are themselves the
        /// character expected next.
        /// </summary>
        /// <returns>
        /// The index just past the last matched character, or -1 when the word does not
        /// match completely (including when the text ends before the word does).
        /// </returns>
        private static int MatchTail(string text, int position, string tail)
        {
            foreach (char expected in tail)
            {
                while (position < text.Length
                       && text[position] != expected
                       && IsSeparator(text[position]))
                {
                    position++;
                }
                if (position >= text.Length || text[position] != expected)
                {
                    return -1;
                }
                position++;
            }
            return position;
        }

        /// <summary>
        /// Returns the end index (exclusive) of the first word in <paramref name="group"/>
        /// that matches <paramref name="text"/> at <paramref name="start"/>, or -1 if none does.
        /// </summary>
        private static int FindMatchEnd(WordGroup group, string text, int start)
        {
            if (group == null)
            {
                return -1;
            }
            for (int z = 0; z < group.Count(); z++)
            {
                int end = MatchTail(text, start + 1, group.GetWord(z));
                if (end >= 0)
                {
                    return end;
                }
            }
            return -1;
        }

        /// <summary>
        /// Replaces every forbidden word in the input with <paramref name="replaceChar"/>
        /// (one replacement character per original character, skipped separators included)
        /// and records the hits in <see cref="IllegalWords"/>.
        /// </summary>
        /// <param name="sSourceInput">Text to filter; null or empty is returned as is.</param>
        /// <param name="replaceChar">Masking character, for example '*'.</param>
        /// <returns>The masked text, with the same length as the input.</returns>
        public string getDataByFilter(string sSourceInput, char replaceChar)
        {
            if (string.IsNullOrEmpty(sSourceInput))
            {
                return sSourceInput;
            }
            // Read the shared reference once so a concurrent reload cannot switch
            // lexicons in the middle of a scan.
            WordGroup[] lexicon = MEMORYLEXICON;
            if (lexicon == null || lexicon.Length == 0)
            {
                SaveLog($"SensitiveWordFilter.getDataByFilter.内存字典为空");
                return sSourceInput;
            }

            illegalWords.Clear();

            // Matching runs on the normalized text (same normalization as the dictionary);
            // masking and reporting use the original characters. Both have equal length.
            string normalized = ToDBC(sSourceInput);
            char[] masked = sSourceInput.ToCharArray();

            int i = 0;
            while (i < normalized.Length)
            {
                int end = FindMatchEnd(lexicon[normalized[i]], normalized, i);
                if (end < 0)
                {
                    i++;
                    continue;
                }
                illegalWords.Add(sSourceInput.Substring(i, end - i));
                for (int pos = i; pos < end; pos++)
                {
                    masked[pos] = replaceChar;
                }
                // Resume right after the masked word.
                i = end;
            }
            return new string(masked);
        }
    }

    /// <summary>
    /// Ordered set of dictionary words that share the same first character.
    /// The stored strings are the words without that first character.
    /// </summary>
    public class WordGroup
    {
        /// <summary>Words in insertion order, for indexed access.</summary>
        private readonly List<string> groupList = new List<string>();

        /// <summary>Membership index so that duplicate checks do not scan the list.</summary>
        private readonly HashSet<string> knownWords = new HashSet<string>(StringComparer.Ordinal);

        public WordGroup()
        {
        }

        /// <summary>Adds a word unless an identical word is already present.</summary>
        /// <param name="word">Word to add.</param>
        public void Add(string word)
        {
            if (knownWords.Add(word))
            {
                groupList.Add(word);
            }
        }

        /// <summary>Returns the number of distinct words in the group.</summary>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>Returns the word at the given zero-based position.</summary>
        /// <param name="index">Position in insertion order.</param>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }
}
上面是一个完整的,独立的实现类。 下面给一个简单的调用示例:
// Global configuration: do this once for the whole program; no further setup is needed afterwards.
SensitiveWordFilter.Init(new string[] {
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\暴恐词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\反动词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\民生词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\色情词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\贪腐词库.txt",
    @"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\其他词库.txt"
});
// Filters can be instantiated in many places; per the author, separate instances can run concurrently.
SensitiveWordFilter wordFilter = new SensitiveWordFilter();
Dictionary<string, string> dictTestData = new Dictionary<string, string>();
// A few sample inputs to see the effect.
dictTestData["杀^人游戏,有人找一夜q"] = "";
dictTestData["数学学习课堂"] = "";
dictTestData["打击法0功有,法0功毒害大众"] = "";
Dictionary<string, string> dictResult = new Dictionary<string, string>();
foreach (string sKey in dictTestData.Keys)
{
    dictResult[sKey] = $"替换后:{wordFilter.getDataByFilter(sKey,'|')}, ------------检测违禁词:{string.Join(",",(wordFilter.IllegalWords==null?new List<string>():wordFilter.IllegalWords))}";
}
// NOTE(review): JsonConverter and Utils are not defined in this listing — presumably project helpers; confirm.
string sResultJson = JsonConverter.SerializeObject(dictResult);
Utils.SaveLog(sResultJson);
最后,给一下打印的结果:
"杀^人游戏,有人找一夜q": 替换后: "杀^人游戏,有人找|||", ------------检测违禁词:一夜q", "数学学习课堂": 替换后:"数学学习课堂", ------------检测违禁词:, "打击法0功有,法0功毒害大众": 替换后:"打击|||有,|||毒害大众", ------------检测违禁词:法0功,法0功"
-------------附
词库下载地址:https://codeload.github.com/chason777777/mgck/zip/master