申请会员ID: 菩提叶

吾爱游客 *发表于 2018-5-28 15:42* · 发表于 2018-5-28 15:42

1.申请 I D：菩提叶
2.个人邮箱：44255977@qq.com
3.技术文章: 70后程序猿一枚，擅长c\c++,delphi,php等，多数代码都代码都已经商业化了，不太好拿出来说事，就拿个十几年前随手写的字符串查找替换小工具来说说吧。
开发工具：delphi6（验证版编译ide是d2009）
需求：实现搜索包含指定文本的文件。（支持多字节字符）
分析：1、由于要支持多字节字符，所以首先要解决的问题是源和目标内容的编码方式必须统一，否则就谈不是比较匹配了，于是写了以下方法来判断并根据编码方式读取文件内容。
type
TTextFormat=(tfAnsi,tfUnicode,tfUnicodeBigEndian,tfUtf8); //定义4种主流编码方式，基本上也就这些了，其它的都是特殊格式，也读不出来，比如word文档

const
TextFormatFlag:array[tfAnsi..tfUtf8] of word=($0000,$FFFE,$FEFF,$EFBB);

function WordLoHiExchange(w:Word):Word;register;
asm
XCHG AL, AH
end;

procedure ReadTextFile(const FileName: string;
var TextFormat: TTextFormat; var sText:string);
var
w:Word;
b:Byte;
begin
with TFileStream.Create(FileName,fmOpenRead or fmShareDenyNone) do
try
Read(w,2);
w:=WordLoHiExchange(w);//因为是以Word数据类型读取，故高低字节互换
if w = TextFormatFlag[tfUnicode] then
   TextFormat:= tfUnicode
else if w = TextFormatFlag[tfUnicodeBigEndian] then
   TextFormat:= tfUnicodeBigEndian
else if w = TextFormatFlag[tfUtf8] then
begin
   Read(b,1);//这里要注意一下，UFT-8必须要跳过三个字节。
   TextFormat:=tfUtf8;
end else
begin
   TextFormat:=tfANSI;
   Position:=0;
end;
SetLength(sText,Size-Position);
ReadBuffer(sText[1],Size-Position);
finally
Free;
end;
end;

能识别和读取了，那么接下来还得把源和待匹配的内容都转为一种编码才能开始匹配，于是又增加以下函数来实现各编码的转换
function StringToWideStringEx(const S: AnsiString; CodePage: Word): WideString;
var
  L: Integer;
begin
  L:= MultiByteToWideChar(CodePage, 0, PAnsiChar(S), -1, nil, 0);
  SetLength(Result, L-1);
  MultiByteToWideChar(CodePage, 0, PAnsiChar(S), -1, PWideChar(Result), L - 1);
end;
//---------------------------------------------------------------------
function WideStringToStringEx(const WS: WideString; CodePage: Word): String;
var
  L: Integer;
begin
  L := WideCharToMultiByte(CodePage, 0, PWideChar(WS), -1, nil, 0, nil, nil);
  SetLength(Result, L-1);
  WideCharToMultiByte(CodePage, 0, PWideChar(WS), -1, PAnsiChar(Result), L - 1, nil, nil);
end;

function Converstr(s:WideString):WideString;
var
  w:Word;
  c:WideChar;
  i:Integer;
  tmp:WideString;
begin
  for i:=1 to Length(s) do
  begin
c:=s;
w:=word(c);
w:=WordLoHiExchange(w);
c:=WideChar(w);
tmp:=tmp+string(c);
  end;
  Result:=tmp;
end;

function AnsiToUnicode(Ansi: string):string;
var
  s:string;
  i, slen:integer;
  j:string[2];
begin
  s:='';
  sLen := length(Ansi);
  for i := 1 to sLen do
  begin
if i mod 2 = 1 then
   j:=IntToHex(word(ansi) shl 8,2)
else j:=IntToHex(word(ansi),2);
s:=s+j;
  end;
  Result :=s;
end;

function HexToInt(hex: char): integer;
begin
  if (hex >='1') and (hex <= '9') then result := strToInt(hex)
  else if  ((hex >= 'a') and (hex <= 'f')) then result := integer(hex)- integer('a') + 10
  else if (hex >= 'A') and (hex <= 'F') then result := integer(hex)- integer('A') + 10
  else result := 0;
end;

function UnicodeToAnsi(Unicode: string):string;
var
  s:string;
  i, slen, m, n:integer;
begin
  s:='';
  sLen := length(Unicode) div 2 ;
  for i := 1 to sLen do
  begin
m := HexToInt(Unicode[i*2-1]);
n := HexToInt(Unicode[i*2]);
s := s+chr(m *16 +n);
  end;
  Result :=s;
end;

2、内容匹配方面，最主要考虑效率问题，经过测试后决定采用以下算法，原算法出自于一个老外，我只是做了些小修改，使之能够用于匹配中文等多字节字符。
为什么要传源长度和匹配长度呢？也是为了效率，不用每次都计算下长度。为什么使用汇编，也是效率问题，原本delphi本身有提供一个pos函数，也是汇编写的，但算法上不够理想，所以使用了其它方法
function FastPos(
  const aSourceString, aFindString : String;
  const aSourceLen, aFindLen, StartPos : integer
  ) : integer;
begin
  // Next, we determine how many bytes we need to
  // scan to find the "start" of aFindString.
// Remove by Terry Tang
{
  SourceLen := aSourceLen;
  SourceLen := SourceLen - aFindLen;
  if (StartPos-1) > SourceLen then begin
Result := 0;
Exit;
  end;
  SourceLen := SourceLen - StartPos;
  SourceLen := SourceLen +2;
}
// Remove end

  // The ASM starts here.
  asm
// Delphi uses ESI, EDI, and EBX a lot,
// so we must preserve them.
push ESI
push EDI
push EBX

// Add by Terry Tang
Mov ECX, aSourceLen
Mov EAX, aFindLen
Sub ECX, EAX
JL  @Result0
Mov EAX, StartPos
Dec EAX
Sub ECX, EAX
JL  @Result0
Inc ECX
// Add end

// Get the address of sourceString[1]
// and Add (StartPos-1).
// We do this for the purpose of finding
// the NEXT occurrence, rather than
// always the first!
mov EDI, aSourceString
add EDI, StartPos
Dec EDI
// Get the address of aFindString.
mov ESI, aFindString
// Note how many bytes we need to
// look through in aSourceString
// to find aFindString.

// Remove by Terry Tang
// mov ECX, SourceLen
// Remove end

// Get the first char of aFindString;
// note how it is done outside of the
// main loop, as it never changes!
Mov  Al, [ESI]
// Now the FindFirstCharacter loop!
@ScaSB:
// Get the value of the current
// character in aSourceString.
// This is equal to ah := EDI^, that
// is what the [] are around [EDI].
Mov  Ah, [EDI]
// Compare this character with aDestString[1].
cmp  Ah,Al
// If they're not equal we don't
// compare the strings.
jne  @NextChar
// If they're equal, obviously we do!
@CompareStrings:
// Put the length of aFindLen in EBX.
mov  EBX, aFindLen
// We DEC EBX to point to the end of
// the string; that is, we don't want to
// add 1 if aFindString is 1 in length!
dec  EBX

// add by ShengQuanhu
// If EBX is zero, then we've successfully
// compared each character; i.e. it's A MATCH!
// It will be happened when aFindLen=1
Jz @EndOfMatch
//add end

//Here’s another optimization tip. People at this point usually PUSH ESI and
//so on and then POP ESI and so forth at the end–instead, I opted not to chan
//ge ESI and so on at all. This saves lots of pushing and popping!
@CompareNext:
// Get aFindString character +
// aFindStringLength (the last char).
mov  Al, [ESI+EBX]
// Get aSourceString character (current
// position + aFindStringLength).
mov  Ah, [EDI+EBX]
// Compare them.
cmp  Al, Ah
Jz @Matches
// If they don't match, we put the first char
// of aFindString into Al again to continue
// looking for the first character.
Mov  Al, [ESI]
Jmp  @NextChar
@Matches:
// If they match, we DEC EBX (point to
// previous character to compare).
Dec  EBX
// If EBX <> 0 ("J"ump "N"ot "Z"ero), we
// continue comparing strings.
Jnz  @CompareNext

//add by Shengquanhu
@EndOfMatch:
//add end

// If EBX is zero, then we've successfully
// compared each character; i.e. it's A MATCH!
// Move the address of the *current*
// character in EDI.
// Note, we haven't altered EDI since
// the first char was found.
mov  EAX, EDI
// This is an address, so subtract the
// address of aSourceString[1] to get
// an actual character position.
sub  EAX, aSourceString
// Inc EAX to make it 1-based,
// rather than 0-based.
inc  EAX
// Put it into result.
mov  Result, EAX
// Finish this routine!
jmp  @TheEnd
@NextChar:
//This is where I jump to when I want to continue searching for the first char
//acter of aFindString in aSearchString:
// Point EDI (aFindString[X]) to
// the next character.
Mov  Ah, [EDI]//先把第一个字符移到Ah中，后面判断是否中文
Inc  EDI
// Dec ECX tells us that we've checked
// another character, and that we're
// fast running out of string to check!
dec  ECX
// If EBX <> 0, then continue scanning
// for the first character.

//add by shengquanhu
//if ah is chinese char,jump again
jz @Result0

cmp  ah, $80
jb @ScaSB
Inc  EDI
Dec  ECX
//add by shengquanhu end

jnz  @ScaSB

//add by shengquanhu
@Result0:
//add by shengquanhu end

// If EBX = 0, then move 0 into RESULT.
mov  Result,0
// Restore EBX, EDI, ESI for Delphi
// to work correctly.
// Note that they're POPped in the
// opposite order they were PUSHed.
@TheEnd:
pop  EBX
pop  EDI
pop  ESI

  end;
end;
另外稍加修改可以适应不区分大小写的情况，代码就不贴出来了。

3、文件搜索，这方面就是调用findfirstfile和findnextfile来检索文件，基本上没什么技术含量，就是要注意一些细节问题，比如文件是否被占用，目标文件是否需要分段读取，由于和界面通讯还得注意下线程安全问题。核心代码如下：

procedure TSearchDirThread.SearchDir(DirName: string);
var
  DSearchRec: TSearchRec;
  ZSearchRec: TSearchRec;
  TmpFile: TStringStream;
  f1: TextFile;
  f:TFileStream;
  soureHX:TMemoryStream;
  FindResult, Findsize: integer;
  filename, str:string;
  astr: AnsiString;
  tmp:PWideChar;
  CodeType:TTextFormat;
  NoCase, founded:Boolean; //区分大小写

  function IsDirNotation(ADirName: String): Boolean;
  begin
Result := (ADirName = '.') or (ADirName = '..');
  end;

begin
  while onoff do
  begin
exit ;
  end;
  DirName := GetDirectoryName(DirName);
  FindResult := FindFirst(DirName+ftp,faAnyFile,DSearchRec);
  try
while FindResult = 0 do
begin
   filename:=Trim(DirName+DSearchRec.Name);
   if ((DSearchRec.Attr and faDirectory) <> faDirectory) and (Pos('hiberfil.sys',filename)=0) then
   begin
      EnterCriticalSection(CS);
      Frm_main.stat1.Panels[1].Text:=filename;
      if FileStatus(filename) then       //先判断文件是否使用中
      begin
      if not Rpl then
      begin
         ReadTextFile(filename,CodeType,str);
         if CodeType=tfAnsi then
         begin
            f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
            SetLength(astr,f.size+1);
            f.ReadBuffer(astr[1],f.Size);
            f.Free;
            str:=astr;
      //    TmpFile:=TStringStream.Create('',TEncoding.ASCII);
      //    TmpFile.WriteBuffer(str[1],Length(str));
      //    astr:=TmpFile.DataString;
         end
         else
         if CodeType=tfUtf8 then
         begin
            f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
            SetLength(astr,f.size-3);
            f.Position:=3;
            f.ReadBuffer(astr[1],f.Size-3);
            f.Free;
            str:=UTF8Decode(astr);
         end
         else if (CodeType=tfUnicode) or (CodeType=tfUnicodeBigEndian) then
            str:=GetFileText(filename);

         if CodeType=tfUnicodeBigEndian then
         begin
            FindStr:=ConverStr(Frm_main.FindString.Text);
            RepStr:=ConverStr(Frm_main.rpstring.Text);
         end
         else begin
            FindStr:=Frm_main.FindString.Text;
            RepStr:=Frm_main.rpstring.Text;
         end;
      end
      else begin
         f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
         SetLength(str,f.size+1);
         f.ReadBuffer(str[1],f.Size);
         f.Free;
         FindStr:=Frm_main.FindString.Text;
         RepStr:=Frm_main.rpstring.Text;
      end;

      Findsize:=Length(str);
      NoCase:= not Frm_main.chk1.Checked;

      if not Rpl then
      begin
         if NoCase then
         begin
            if FastPosNoCase(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
            founded:=True
            else
            founded:=False;
         end
         else begin
            if FastPos(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
            founded:=True
            else
            founded:=False;
         end;
         if founded then
         begin
            Inc(Fcut);
            Frm_Main.lv1.Items.Add.Caption:=LowerCase(DSearchRec.Name);
            Frm_Main.lv1.Items.Item[Frm_Main.lv1.Items.Count - 1].SubItems.Add(DirName);
         end;
      end
      else begin
         if NoCase then
         begin
            if FastPosNoCase(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
            founded:=True
            else
            founded:=False;
         end
         else begin
            if FastPos(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
            founded:=True
            else
            founded:=False;
         end;
         if founded then
         begin
            Inc(Fcut);

            Frm_Main.lv1.Items.Add.Caption:=LowerCase(DSearchRec.Name);
            Frm_Main.lv1.Items.Item[Frm_Main.lv1.Items.Count - 1].SubItems.Add(DirName);
            str:=FastReplace(str,FindStr,RepStr,False);
            soureHX:=TMemoryStream.Create;
         { if CodeType=tfUtf8 then
            str:=UTF8Encode(widestring(str))
            else if (CodeType=tfUnicode) then
            str:=AnsiToUnicode(str)
            else if (CodeType=tfUnicodeBigEndian) then
            str:=AnsiToUnicode(str);    }

            soureHX.Write(str[1],Length(str));
            soureHX.SaveToFile(filename);
            soureHX.Free;
         end;
      end;
      end;
      LeaveCriticalSection(CS);
   end;
   FindResult := FindNext(DSearchRec);
end;
  finally
FindClose(DSearchRec);
  end;
  FindResult := FindFirst(DirName+'*.*', faDirectory+faHidden, zSearchRec);
try
  while FindResult = 0 do
  begin
if ((ZSearchRec.Attr and faDirectory) = faDirectory) and not
      IsDirNotation(zSearchRec.Name) then
   Searchdir(DirName+zSearchRec.Name);
FindResult := FindNext(zSearchRec);
  end;
finally
  FindClose(zSearchRec);
end;
end;

4，最后就是界面问题，一个好的界面并不一定多华丽，但要让用户用起来没难度，所以就简单易用的原则搞了下，做了一些体验上的优化，如记录最近搜索过的路径，关键字自动完成等。
实际上目前还能baidu搜索到该小工具。这个小工具最大的特点就是快，有兴趣的话可以对比下同类工具，我估计在同类工具里应该是最快的（多年前我测试过，利益于高效的匹配算法，效率方面确实是最高的，十几年过去了，也不知道新的产品有没有改进）。
此次为了验证，特别重新编译了一个，在界面上增加了一个验证按钮，如图。
[img=110,0]https://www.52pojie.cn/forum.php?mod=image&aid=1143897&size=300x300&key=a504db5d4afa0be8&nocache=yes&type=贴图错误，请阅读“贴图帮助”。

成品下载地址: https://pan.baidu.com/s/1KE1Me66wpumfJffrVnq1CQ

Hmily · 发表于 2018-5-30 17:02

还是有些普通，请补充更多技术分析吧。

danao · 发表于 2018-10-20 11:33

看着好厉害

帐号		自动登录	找回密码
密码			注册[Register]

选中篇:

[会员申请] 申请会员ID: 菩提叶


	选中篇: 置顶\|

选中 篇:

[会员申请] 申请会员ID: 菩提叶

选中篇: