1.申 请 I D:菩提叶
2.个人邮箱:44255977@qq.com
3.技术文章: 70后程序猿一枚,擅长c\c++,delphi,php等,多数代码都代码都已经商业化了,不太好拿出来说事,就拿个十几年前随手写的字符串查找替换小工具来说说吧。
开发工具:delphi6(验证版编译ide是d2009)
需求:实现搜索包含指定文本的文件。(支持多字节字符)
分析:1、由于要支持多字节字符,所以首先要解决的问题是源和目标内容的编码方式必须统一,否则就谈不是比较匹配了,于是写了以下方法来判断并根据编码方式读取文件内容。
type
TTextFormat=(tfAnsi,tfUnicode,tfUnicodeBigEndian,tfUtf8); //定义4种主流编码方式,基本上也就这些了,其它的都是特殊格式,也读不出来,比如word文档
const
TextFormatFlag:array[tfAnsi..tfUtf8] of word=($0000,$FFFE,$FEFF,$EFBB);
function WordLoHiExchange(w:Word):Word;register;
asm
XCHG AL, AH
end;
procedure ReadTextFile(const FileName: string;
var TextFormat: TTextFormat; var sText:string);
var
w:Word;
b:Byte;
begin
with TFileStream.Create(FileName,fmOpenRead or fmShareDenyNone) do
try
Read(w,2);
w:=WordLoHiExchange(w);//因为是以Word数据类型读取,故高低字节互换
if w = TextFormatFlag[tfUnicode] then
TextFormat:= tfUnicode
else if w = TextFormatFlag[tfUnicodeBigEndian] then
TextFormat:= tfUnicodeBigEndian
else if w = TextFormatFlag[tfUtf8] then
begin
Read(b,1);//这里要注意一下,UFT-8必须要跳过三个字节。
TextFormat:=tfUtf8;
end else
begin
TextFormat:=tfANSI;
Position:=0;
end;
SetLength(sText,Size-Position);
ReadBuffer(sText[1],Size-Position);
finally
Free;
end;
end;
能识别和读取了,那么接下来还得把源和待匹配的内容都转为一种编码才能开始匹配,于是又增加以下函数来实现各编码的转换
function StringToWideStringEx(const S: AnsiString; CodePage: Word): WideString;
var
L: Integer;
begin
L:= MultiByteToWideChar(CodePage, 0, PAnsiChar(S), -1, nil, 0);
SetLength(Result, L-1);
MultiByteToWideChar(CodePage, 0, PAnsiChar(S), -1, PWideChar(Result), L - 1);
end;
//---------------------------------------------------------------------
function WideStringToStringEx(const WS: WideString; CodePage: Word): String;
var
L: Integer;
begin
L := WideCharToMultiByte(CodePage, 0, PWideChar(WS), -1, nil, 0, nil, nil);
SetLength(Result, L-1);
WideCharToMultiByte(CodePage, 0, PWideChar(WS), -1, PAnsiChar(Result), L - 1, nil, nil);
end;
function Converstr(s:WideString):WideString;
var
w:Word;
c:WideChar;
i:Integer;
tmp:WideString;
begin
for i:=1 to Length(s) do
begin
c:=s;
w:=word(c);
w:=WordLoHiExchange(w);
c:=WideChar(w);
tmp:=tmp+string(c);
end;
Result:=tmp;
end;
function AnsiToUnicode(Ansi: string):string;
var
s:string;
i, slen:integer;
j:string[2];
begin
s:='';
sLen := length(Ansi);
for i := 1 to sLen do
begin
if i mod 2 = 1 then
j:=IntToHex(word(ansi) shl 8,2)
else j:=IntToHex(word(ansi),2);
s:=s+j;
end;
Result :=s;
end;
function HexToInt(hex: char): integer;
begin
if (hex >='1') and (hex <= '9') then result := strToInt(hex)
else if ((hex >= 'a') and (hex <= 'f')) then result := integer(hex)- integer('a') + 10
else if (hex >= 'A') and (hex <= 'F') then result := integer(hex)- integer('A') + 10
else result := 0;
end;
function UnicodeToAnsi(Unicode: string):string;
var
s:string;
i, slen, m, n:integer;
begin
s:='';
sLen := length(Unicode) div 2 ;
for i := 1 to sLen do
begin
m := HexToInt(Unicode[i*2-1]);
n := HexToInt(Unicode[i*2]);
s := s+chr(m *16 +n);
end;
Result :=s;
end;
2、内容匹配方面,最主要考虑效率问题,经过测试后决定采用以下算法,原算法出自于一个老外,我只是做了些小修改,使之能够用于匹配中文等多字节字符。
为什么要传源长度和匹配长度呢?也是为了效率,不用每次都计算下长度。为什么使用汇编,也是效率问题,原本delphi本身有提供一个pos函数,也是汇编写的,但算法上不够理想,所以使用了其它方法
function FastPos(
const aSourceString, aFindString : String;
const aSourceLen, aFindLen, StartPos : integer
) : integer;
begin
// Next, we determine how many bytes we need to
// scan to find the "start" of aFindString.
// Remove by Terry Tang
{
SourceLen := aSourceLen;
SourceLen := SourceLen - aFindLen;
if (StartPos-1) > SourceLen then begin
Result := 0;
Exit;
end;
SourceLen := SourceLen - StartPos;
SourceLen := SourceLen +2;
}
// Remove end
// The ASM starts here.
asm
// Delphi uses ESI, EDI, and EBX a lot,
// so we must preserve them.
push ESI
push EDI
push EBX
// Add by Terry Tang
Mov ECX, aSourceLen
Mov EAX, aFindLen
Sub ECX, EAX
JL @Result0
Mov EAX, StartPos
Dec EAX
Sub ECX, EAX
JL @Result0
Inc ECX
// Add end
// Get the address of sourceString[1]
// and Add (StartPos-1).
// We do this for the purpose of finding
// the NEXT occurrence, rather than
// always the first!
mov EDI, aSourceString
add EDI, StartPos
Dec EDI
// Get the address of aFindString.
mov ESI, aFindString
// Note how many bytes we need to
// look through in aSourceString
// to find aFindString.
// Remove by Terry Tang
// mov ECX, SourceLen
// Remove end
// Get the first char of aFindString;
// note how it is done outside of the
// main loop, as it never changes!
Mov Al, [ESI]
// Now the FindFirstCharacter loop!
@ScaSB:
// Get the value of the current
// character in aSourceString.
// This is equal to ah := EDI^, that
// is what the [] are around [EDI].
Mov Ah, [EDI]
// Compare this character with aDestString[1].
cmp Ah,Al
// If they're not equal we don't
// compare the strings.
jne @NextChar
// If they're equal, obviously we do!
@CompareStrings:
// Put the length of aFindLen in EBX.
mov EBX, aFindLen
// We DEC EBX to point to the end of
// the string; that is, we don't want to
// add 1 if aFindString is 1 in length!
dec EBX
// add by ShengQuanhu
// If EBX is zero, then we've successfully
// compared each character; i.e. it's A MATCH!
// It will be happened when aFindLen=1
Jz @EndOfMatch
//add end
//Here’s another optimization tip. People at this point usually PUSH ESI and
//so on and then POP ESI and so forth at the end–instead, I opted not to chan
//ge ESI and so on at all. This saves lots of pushing and popping!
@CompareNext:
// Get aFindString character +
// aFindStringLength (the last char).
mov Al, [ESI+EBX]
// Get aSourceString character (current
// position + aFindStringLength).
mov Ah, [EDI+EBX]
// Compare them.
cmp Al, Ah
Jz @Matches
// If they don't match, we put the first char
// of aFindString into Al again to continue
// looking for the first character.
Mov Al, [ESI]
Jmp @NextChar
@Matches:
// If they match, we DEC EBX (point to
// previous character to compare).
Dec EBX
// If EBX <> 0 ("J"ump "N"ot "Z"ero), we
// continue comparing strings.
Jnz @CompareNext
//add by Shengquanhu
@EndOfMatch:
//add end
// If EBX is zero, then we've successfully
// compared each character; i.e. it's A MATCH!
// Move the address of the *current*
// character in EDI.
// Note, we haven't altered EDI since
// the first char was found.
mov EAX, EDI
// This is an address, so subtract the
// address of aSourceString[1] to get
// an actual character position.
sub EAX, aSourceString
// Inc EAX to make it 1-based,
// rather than 0-based.
inc EAX
// Put it into result.
mov Result, EAX
// Finish this routine!
jmp @TheEnd
@NextChar:
//This is where I jump to when I want to continue searching for the first char
//acter of aFindString in aSearchString:
// Point EDI (aFindString[X]) to
// the next character.
Mov Ah, [EDI]//先把第一个字符移到Ah中,后面判断是否中文
Inc EDI
// Dec ECX tells us that we've checked
// another character, and that we're
// fast running out of string to check!
dec ECX
// If EBX <> 0, then continue scanning
// for the first character.
//add by shengquanhu
//if ah is chinese char,jump again
jz @Result0
cmp ah, $80
jb @ScaSB
Inc EDI
Dec ECX
//add by shengquanhu end
jnz @ScaSB
//add by shengquanhu
@Result0:
//add by shengquanhu end
// If EBX = 0, then move 0 into RESULT.
mov Result,0
// Restore EBX, EDI, ESI for Delphi
// to work correctly.
// Note that they're POPped in the
// opposite order they were PUSHed.
@TheEnd:
pop EBX
pop EDI
pop ESI
end;
end;
另外稍加修改可以适应不区分大小写的情况,代码就不贴出来了。
3、文件搜索,这方面就是调用findfirstfile和findnextfile来检索文件,基本上没什么技术含量,就是要注意一些细节问题,比如文件是否被占用,目标文件是否需要分段读取,由于和界面通讯还得注意下线程安全问题。核心代码如下:
procedure TSearchDirThread.SearchDir(DirName: string);
var
DSearchRec: TSearchRec;
ZSearchRec: TSearchRec;
TmpFile: TStringStream;
f1: TextFile;
f:TFileStream;
soureHX:TMemoryStream;
FindResult, Findsize: integer;
filename, str:string;
astr: AnsiString;
tmp:PWideChar;
CodeType:TTextFormat;
NoCase, founded:Boolean; //区分大小写
function IsDirNotation(ADirName: String): Boolean;
begin
Result := (ADirName = '.') or (ADirName = '..');
end;
begin
while onoff do
begin
exit ;
end;
DirName := GetDirectoryName(DirName);
FindResult := FindFirst(DirName+ftp,faAnyFile,DSearchRec);
try
while FindResult = 0 do
begin
filename:=Trim(DirName+DSearchRec.Name);
if ((DSearchRec.Attr and faDirectory) <> faDirectory) and (Pos('hiberfil.sys',filename)=0) then
begin
EnterCriticalSection(CS);
Frm_main.stat1.Panels[1].Text:=filename;
if FileStatus(filename) then //先判断文件是否使用中
begin
if not Rpl then
begin
ReadTextFile(filename,CodeType,str);
if CodeType=tfAnsi then
begin
f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
SetLength(astr,f.size+1);
f.ReadBuffer(astr[1],f.Size);
f.Free;
str:=astr;
// TmpFile:=TStringStream.Create('',TEncoding.ASCII);
// TmpFile.WriteBuffer(str[1],Length(str));
// astr:=TmpFile.DataString;
end
else
if CodeType=tfUtf8 then
begin
f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
SetLength(astr,f.size-3);
f.Position:=3;
f.ReadBuffer(astr[1],f.Size-3);
f.Free;
str:=UTF8Decode(astr);
end
else if (CodeType=tfUnicode) or (CodeType=tfUnicodeBigEndian) then
str:=GetFileText(filename);
if CodeType=tfUnicodeBigEndian then
begin
FindStr:=ConverStr(Frm_main.FindString.Text);
RepStr:=ConverStr(Frm_main.rpstring.Text);
end
else begin
FindStr:=Frm_main.FindString.Text;
RepStr:=Frm_main.rpstring.Text;
end;
end
else begin
f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
SetLength(str,f.size+1);
f.ReadBuffer(str[1],f.Size);
f.Free;
FindStr:=Frm_main.FindString.Text;
RepStr:=Frm_main.rpstring.Text;
end;
Findsize:=Length(str);
NoCase:= not Frm_main.chk1.Checked;
if not Rpl then
begin
if NoCase then
begin
if FastPosNoCase(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
founded:=True
else
founded:=False;
end
else begin
if FastPos(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
founded:=True
else
founded:=False;
end;
if founded then
begin
Inc(Fcut);
Frm_Main.lv1.Items.Add.Caption:=LowerCase(DSearchRec.Name);
Frm_Main.lv1.Items.Item[Frm_Main.lv1.Items.Count - 1].SubItems.Add(DirName);
end;
end
else begin
if NoCase then
begin
if FastPosNoCase(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
founded:=True
else
founded:=False;
end
else begin
if FastPos(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
founded:=True
else
founded:=False;
end;
if founded then
begin
Inc(Fcut);
Frm_Main.lv1.Items.Add.Caption:=LowerCase(DSearchRec.Name);
Frm_Main.lv1.Items.Item[Frm_Main.lv1.Items.Count - 1].SubItems.Add(DirName);
str:=FastReplace(str,FindStr,RepStr,False);
soureHX:=TMemoryStream.Create;
{ if CodeType=tfUtf8 then
str:=UTF8Encode(widestring(str))
else if (CodeType=tfUnicode) then
str:=AnsiToUnicode(str)
else if (CodeType=tfUnicodeBigEndian) then
str:=AnsiToUnicode(str); }
soureHX.Write(str[1],Length(str));
soureHX.SaveToFile(filename);
soureHX.Free;
end;
end;
end;
LeaveCriticalSection(CS);
end;
FindResult := FindNext(DSearchRec);
end;
finally
FindClose(DSearchRec);
end;
FindResult := FindFirst(DirName+'*.*', faDirectory+faHidden, zSearchRec);
try
while FindResult = 0 do
begin
if ((ZSearchRec.Attr and faDirectory) = faDirectory) and not
IsDirNotation(zSearchRec.Name) then
Searchdir(DirName+zSearchRec.Name);
FindResult := FindNext(zSearchRec);
end;
finally
FindClose(zSearchRec);
end;
end;
4,最后就是界面问题,一个好的界面并不一定多华丽,但要让用户用起来没难度,所以就简单易用的原则搞了下,做了一些体验上的优化,如记录最近搜索过的路径,关键字自动完成等。
实际上目前还能baidu搜索到该小工具。这个小工具最大的特点就是快,有兴趣的话可以对比下同类工具,我估计在同类工具里应该是最快的(多年前我测试过,利益于高效的匹配算法,效率方面确实是最高的,十几年过去了,也不知道新的产品有没有改进)。
此次为了验证,特别重新编译了一个,在界面上增加了一个验证按钮,如图。
[img=110,0]https://www.52pojie.cn/forum.php?mod=image&aid=1143897&size=300x300&key=a504db5d4afa0be8&nocache=yes&type=贴图错误,请阅读“贴图帮助”。
成品下载地址: https://pan.baidu.com/s/1KE1Me66wpumfJffrVnq1CQ
|