申请会员ID: 菩提叶

域发表于 2018-5-28 15:42

1.申请 I D：菩提叶
2.个人邮箱：44255977@qq.com
3.技术文章: 70后程序猿一枚，擅长c\c++,delphi,php等，多数代码都代码都已经商业化了，不太好拿出来说事，就拿个十几年前随手写的字符串查找替换小工具来说说吧。
开发工具：delphi6（验证版编译ide是d2009）
需求：实现搜索包含指定文本的文件。（支持多字节字符）
分析：1、由于要支持多字节字符，所以首先要解决的问题是源和目标内容的编码方式必须统一，否则就谈不是比较匹配了，于是写了以下方法来判断并根据编码方式读取文件内容。
type
TTextFormat=(tfAnsi,tfUnicode,tfUnicodeBigEndian,tfUtf8); //定义4种主流编码方式，基本上也就这些了，其它的都是特殊格式，也读不出来，比如word文档

const
TextFormatFlag:array of word=($0000,$FFFE,$FEFF,$EFBB);

function WordLoHiExchange(w:Word):Word;register;
asm
XCHG AL, AH
end;

procedure ReadTextFile(const FileName: string;
var TextFormat: TTextFormat; var sText:string);
var
w:Word;
b:Byte;
begin
with TFileStream.Create(FileName,fmOpenRead or fmShareDenyNone) do
try
Read(w,2);
w:=WordLoHiExchange(w);//因为是以Word数据类型读取，故高低字节互换
if w = TextFormatFlag then
TextFormat:= tfUnicode
else if w = TextFormatFlag then
TextFormat:= tfUnicodeBigEndian
else if w = TextFormatFlag then
begin
Read(b,1);//这里要注意一下，UFT-8必须要跳过三个字节。
TextFormat:=tfUtf8;
end else
begin
TextFormat:=tfANSI;
Position:=0;
end;
SetLength(sText,Size-Position);
ReadBuffer(sText,Size-Position);
finally
Free;
end;
end;

能识别和读取了，那么接下来还得把源和待匹配的内容都转为一种编码才能开始匹配，于是又增加以下函数来实现各编码的转换
function StringToWideStringEx(const S: AnsiString; CodePage: Word): WideString;
var
L: Integer;
begin
L:= MultiByteToWideChar(CodePage, 0, PAnsiChar(S), -1, nil, 0);
SetLength(Result, L-1);
MultiByteToWideChar(CodePage, 0, PAnsiChar(S), -1, PWideChar(Result), L - 1);
end;
//---------------------------------------------------------------------
function WideStringToStringEx(const WS: WideString; CodePage: Word): String;
var
L: Integer;
begin
L := WideCharToMultiByte(CodePage, 0, PWideChar(WS), -1, nil, 0, nil, nil);
SetLength(Result, L-1);
WideCharToMultiByte(CodePage, 0, PWideChar(WS), -1, PAnsiChar(Result), L - 1, nil, nil);
end;

function Converstr(s:WideString):WideString;
var
w:Word;
c:WideChar;
i:Integer;
tmp:WideString;
begin
for i:=1 to Length(s) do
begin
c:=s;
w:=word(c);
w:=WordLoHiExchange(w);
c:=WideChar(w);
tmp:=tmp+string(c);
end;
Result:=tmp;
end;

function AnsiToUnicode(Ansi: string):string;
var
s:string;
i, slen:integer;
j:string;
begin
s:='';
sLen := length(Ansi);
for i := 1 to sLen do
begin
if i mod 2 = 1 then
   j:=IntToHex(word(ansi) shl 8,2)
else j:=IntToHex(word(ansi),2);
s:=s+j;
end;
Result :=s;
end;

function HexToInt(hex: char): integer;
begin
if (hex >='1') and (hex <= '9') then result := strToInt(hex)
else if((hex >= 'a') and (hex <= 'f')) then result := integer(hex)- integer('a') + 10
else if (hex >= 'A') and (hex <= 'F') then result := integer(hex)- integer('A') + 10
else result := 0;
end;

function UnicodeToAnsi(Unicode: string):string;
var
s:string;
i, slen, m, n:integer;
begin
s:='';
sLen := length(Unicode) div 2 ;
for i := 1 to sLen do
begin
m := HexToInt(Unicode);
n := HexToInt(Unicode);
s := s+chr(m *16 +n);
end;
Result :=s;
end;

2、内容匹配方面，最主要考虑效率问题，经过测试后决定采用以下算法，原算法出自于一个老外，我只是做了些小修改，使之能够用于匹配中文等多字节字符。
为什么要传源长度和匹配长度呢？也是为了效率，不用每次都计算下长度。为什么使用汇编，也是效率问题，原本delphi本身有提供一个pos函数，也是汇编写的，但算法上不够理想，所以使用了其它方法
function FastPos(
const aSourceString, aFindString : String;
const aSourceLen, aFindLen, StartPos : integer
) : integer;
begin
// Next, we determine how many bytes we need to
// scan to find the "start" of aFindString.
// Remove by Terry Tang
{
SourceLen := aSourceLen;
SourceLen := SourceLen - aFindLen;
if (StartPos-1) > SourceLen then begin
Result := 0;
Exit;
end;
SourceLen := SourceLen - StartPos;
SourceLen := SourceLen +2;
}
// Remove end

// The ASM starts here.
asm
// Delphi uses ESI, EDI, and EBX a lot,
// so we must preserve them.
push ESI
push EDI
push EBX

// Add by Terry Tang
Mov ECX, aSourceLen
Mov EAX, aFindLen
Sub ECX, EAX
JL@Result0
Mov EAX, StartPos
Dec EAX
Sub ECX, EAX
JL@Result0
Inc ECX
// Add end

// Get the address of sourceString
// and Add (StartPos-1).
// We do this for the purpose of finding
// the NEXT occurrence, rather than
// always the first!
mov EDI, aSourceString
add EDI, StartPos
Dec EDI
// Get the address of aFindString.
mov ESI, aFindString
// Note how many bytes we need to
// look through in aSourceString
// to find aFindString.

// Remove by Terry Tang
// mov ECX, SourceLen
// Remove end

// Get the first char of aFindString;
// note how it is done outside of the
// main loop, as it never changes!
MovAl,
// Now the FindFirstCharacter loop!
@ScaSB:
// Get the value of the current
// character in aSourceString.
// This is equal to ah := EDI^, that
// is what the [] are around .
MovAh,
// Compare this character with aDestString.
cmpAh,Al
// If they're not equal we don't
// compare the strings.
jne@NextChar
// If they're equal, obviously we do!
@CompareStrings:
// Put the length of aFindLen in EBX.
movEBX, aFindLen
// We DEC EBX to point to the end of
// the string; that is, we don't want to
// add 1 if aFindString is 1 in length!
decEBX

// add by ShengQuanhu
// If EBX is zero, then we've successfully
// compared each character; i.e. it's A MATCH!
// It will be happened when aFindLen=1
Jz @EndOfMatch
//add end

//Here’s another optimization tip. People at this point usually PUSH ESI and
//so on and then POP ESI and so forth at the end–instead, I opted not to chan
//ge ESI and so on at all. This saves lots of pushing and popping!
@CompareNext:
// Get aFindString character +
// aFindStringLength (the last char).
movAl,
// Get aSourceString character (current
// position + aFindStringLength).
movAh,
// Compare them.
cmpAl, Ah
Jz @Matches
// If they don't match, we put the first char
// of aFindString into Al again to continue
// looking for the first character.
MovAl,
Jmp@NextChar
@Matches:
// If they match, we DEC EBX (point to
// previous character to compare).
DecEBX
// If EBX <> 0 ("J"ump "N"ot "Z"ero), we
// continue comparing strings.
Jnz@CompareNext

//add by Shengquanhu
@EndOfMatch:
//add end

// If EBX is zero, then we've successfully
// compared each character; i.e. it's A MATCH!
// Move the address of the *current*
// character in EDI.
// Note, we haven't altered EDI since
// the first char was found.
movEAX, EDI
// This is an address, so subtract the
// address of aSourceString to get
// an actual character position.
subEAX, aSourceString
// Inc EAX to make it 1-based,
// rather than 0-based.
incEAX
// Put it into result.
movResult, EAX
// Finish this routine!
jmp@TheEnd
@NextChar:
//This is where I jump to when I want to continue searching for the first char
//acter of aFindString in aSearchString:
// Point EDI (aFindString) to
// the next character.
MovAh, //先把第一个字符移到Ah中，后面判断是否中文
IncEDI
// Dec ECX tells us that we've checked
// another character, and that we're
// fast running out of string to check!
decECX
// If EBX <> 0, then continue scanning
// for the first character.

//add by shengquanhu
//if ah is chinese char,jump again
jz @Result0

cmpah, $80
jb @ScaSB
IncEDI
DecECX
//add by shengquanhu end

jnz@ScaSB

//add by shengquanhu
@Result0:
//add by shengquanhu end

// If EBX = 0, then move 0 into RESULT.
movResult,0
// Restore EBX, EDI, ESI for Delphi
// to work correctly.
// Note that they're POPped in the
// opposite order they were PUSHed.
@TheEnd:
popEBX
popEDI
popESI

end;
end;
另外稍加修改可以适应不区分大小写的情况，代码就不贴出来了。

3、文件搜索，这方面就是调用findfirstfile和findnextfile来检索文件，基本上没什么技术含量，就是要注意一些细节问题，比如文件是否被占用，目标文件是否需要分段读取，由于和界面通讯还得注意下线程安全问题。核心代码如下：

procedure TSearchDirThread.SearchDir(DirName: string);
var
DSearchRec: TSearchRec;
ZSearchRec: TSearchRec;
TmpFile: TStringStream;
f1: TextFile;
f:TFileStream;
soureHX:TMemoryStream;
FindResult, Findsize: integer;
filename, str:string;
astr: AnsiString;
tmp:PWideChar;
CodeType:TTextFormat;
NoCase, founded:Boolean; //区分大小写

function IsDirNotation(ADirName: String): Boolean;
begin
Result := (ADirName = '.') or (ADirName = '..');
end;

begin
while onoff do
begin
exit ;
end;
DirName := GetDirectoryName(DirName);
FindResult := FindFirst(DirName+ftp,faAnyFile,DSearchRec);
try
while FindResult = 0 do
begin
   filename:=Trim(DirName+DSearchRec.Name);
   if ((DSearchRec.Attr and faDirectory) <> faDirectory) and (Pos('hiberfil.sys',filename)=0) then
   begin
   EnterCriticalSection(CS);
   Frm_main.stat1.Panels.Text:=filename;
   if FileStatus(filename) then    //先判断文件是否使用中
   begin
      if not Rpl then
      begin
         ReadTextFile(filename,CodeType,str);
         if CodeType=tfAnsi then
         begin
         f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
         SetLength(astr,f.size+1);
         f.ReadBuffer(astr,f.Size);
         f.Free;
         str:=astr;
   //    TmpFile:=TStringStream.Create('',TEncoding.ASCII);
   //    TmpFile.WriteBuffer(str,Length(str));
   //    astr:=TmpFile.DataString;
         end
         else
         if CodeType=tfUtf8 then
         begin
         f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
         SetLength(astr,f.size-3);
         f.Position:=3;
         f.ReadBuffer(astr,f.Size-3);
         f.Free;
         str:=UTF8Decode(astr);
         end
         else if (CodeType=tfUnicode) or (CodeType=tfUnicodeBigEndian) then
         str:=GetFileText(filename);

         if CodeType=tfUnicodeBigEndian then
         begin
            FindStr:=ConverStr(Frm_main.FindString.Text);
            RepStr:=ConverStr(Frm_main.rpstring.Text);
         end
         else begin
            FindStr:=Frm_main.FindString.Text;
            RepStr:=Frm_main.rpstring.Text;
         end;
      end
      else begin
         f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
         SetLength(str,f.size+1);
         f.ReadBuffer(str,f.Size);
         f.Free;
         FindStr:=Frm_main.FindString.Text;
         RepStr:=Frm_main.rpstring.Text;
      end;

      Findsize:=Length(str);
      NoCase:= not Frm_main.chk1.Checked;

      if not Rpl then
      begin
         if NoCase then
         begin
         if FastPosNoCase(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
            founded:=True
         else
            founded:=False;
         end
         else begin
         if FastPos(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
            founded:=True
         else
            founded:=False;
         end;
         if founded then
         begin
         Inc(Fcut);
         Frm_Main.lv1.Items.Add.Caption:=LowerCase(DSearchRec.Name);
         Frm_Main.lv1.Items.Item.SubItems.Add(DirName);
         end;
      end
      else begin
         if NoCase then
         begin
         if FastPosNoCase(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
            founded:=True
         else
            founded:=False;
         end
         else begin
         if FastPos(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
            founded:=True
         else
            founded:=False;
         end;
         if founded then
         begin
         Inc(Fcut);

         Frm_Main.lv1.Items.Add.Caption:=LowerCase(DSearchRec.Name);
         Frm_Main.lv1.Items.Item.SubItems.Add(DirName);
         str:=FastReplace(str,FindStr,RepStr,False);
         soureHX:=TMemoryStream.Create;
         { if CodeType=tfUtf8 then
            str:=UTF8Encode(widestring(str))
         else if (CodeType=tfUnicode) then
            str:=AnsiToUnicode(str)
         else if (CodeType=tfUnicodeBigEndian) then
            str:=AnsiToUnicode(str); }

         soureHX.Write(str,Length(str));
         soureHX.SaveToFile(filename);
         soureHX.Free;
         end;
      end;
   end;
   LeaveCriticalSection(CS);
   end;
   FindResult := FindNext(DSearchRec);
end;
finally
FindClose(DSearchRec);
end;
FindResult := FindFirst(DirName+'*.*', faDirectory+faHidden, zSearchRec);
try
while FindResult = 0 do
begin
if ((ZSearchRec.Attr and faDirectory) = faDirectory) and not
   IsDirNotation(zSearchRec.Name) then
   Searchdir(DirName+zSearchRec.Name);
FindResult := FindNext(zSearchRec);
end;
finally
FindClose(zSearchRec);
end;
end;

4，最后就是界面问题，一个好的界面并不一定多华丽，但要让用户用起来没难度，所以就简单易用的原则搞了下，做了一些体验上的优化，如记录最近搜索过的路径，关键字自动完成等。
实际上目前还能baidu搜索到该小工具。这个小工具最大的特点就是快，有兴趣的话可以对比下同类工具，我估计在同类工具里应该是最快的（多年前我测试过，利益于高效的匹配算法，效率方面确实是最高的，十几年过去了，也不知道新的产品有没有改进）。
此次为了验证，特别重新编译了一个，在界面上增加了一个验证按钮，如图。
https://www.52pojie.cn/forum.php?mod=image&aid=1143897&size=300x300&key=a504db5d4afa0be8&nocache=yes&type=贴图错误，请阅读“贴图帮助”。

成品下载地址: https://pan.baidu.com/s/1KE1Me66wpumfJffrVnq1CQ

Hmily 发表于 2018-5-30 17:02

还是有些普通，请补充更多技术分析吧。

danao 发表于 2018-10-20 11:33

看着好厉害

页: [1]

吾爱破解 - 52pojie.cn's Archiver

申请会员ID: 菩提叶