Skip to content

Commit

Permalink
2021.09的老代码,后续再细看
Browse files Browse the repository at this point in the history
  • Loading branch information
a232319779 committed Nov 11, 2022
1 parent e5ba10f commit 9e0fc76
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 37 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ __pycache__/
*$py.class

datas/
myconfigs/

# C extensions
*.so
Expand Down
15 changes: 15 additions & 0 deletions appspider/configs/postgreconfig.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
# @Time : 2022/04/29 16:42:45
# @Author : ddvv
# @Site : https://ddvvmmzz.github.io
# @File : postgreconfig.py
# @Software : Visual Studio Code
# @WeChat : NextB

# Connection settings for the PostgreSQL database (see file name).
# All values below are blank placeholders.
# NOTE(review): presumably read by run_spiders.py when building the
# SQLAlchemy engine -- confirm against the caller.
db_config = dict(
    address='',
    port=0,
    username='',
    password='',
    db_name='',
)
15 changes: 9 additions & 6 deletions appspider/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,21 @@

from sqlalchemy import Column, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.sqltypes import INTEGER
from sqlalchemy.sql.sqltypes import DateTime, BIGINT

Base = declarative_base()

class TelegramMessage(Base):
    """ORM model for one scraped Telegram message.

    Rows are stored in the ``nextb_telegram_messages`` table. Each column
    is declared exactly once; the identifier columns use BIGINT and the
    timestamp columns use DateTime.
    """

    __tablename__ = 'nextb_telegram_messages'

    # Surrogate primary key, auto-incremented.
    id = Column(BIGINT(), primary_key=True, unique=True, autoincrement=True)
    # Telegram-side identifiers of the message, its chat and its sender.
    # NOTE(review): presumably BIGINT because these ids may not fit in a
    # 32-bit INTEGER -- confirm.
    message_id = Column(BIGINT())
    chat_id = Column(BIGINT())
    user_id = Column(BIGINT())
    # Sender's username and display name (first name + last name).
    user_name = Column(String(255))
    nick_name = Column(String(255))
    # Date of the message itself.
    postal_time = Column(DateTime)
    # Id of the message this one replies to, when it is a reply.
    reply_to_msg_id = Column(BIGINT())
    # Origin name and date of a forwarded message.
    from_name = Column(String(255))
    from_time = Column(DateTime)
    # Message text.
    message = Column(String(5096))
42 changes: 24 additions & 18 deletions appspider/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,18 @@ def open_spider(self, spider):
def process_item(self, item, spider):
if len(self.datas) >= self.push_number:
for data in self.datas:
new_messaeg = TelegramMessage()
new_messaeg.message_id = data.get('message_id', -1)
new_messaeg.chat_id = data.get('chat_id', -1)
new_messaeg.user_id = data.get('user_id', -1)
new_messaeg.user_name = data.get('user_name', '')
new_messaeg.nick_name = data.get('nick_name', '')
new_messaeg.postal_time = data.get('postal_time', '')
new_messaeg.message = data.get('message', '')
self.session_maker.add(new_messaeg)
new_message = TelegramMessage()
new_message.message_id = data.get('message_id', -1)
new_message.chat_id = data.get('chat_id', -1)
new_message.user_id = data.get('user_id', -1)
new_message.user_name = data.get('user_name', '')
new_message.nick_name = data.get('nick_name', '')
new_message.postal_time = data.get('postal_time')
new_message.reply_to_msg_id = data.get('reply_to_msg_id', -1)
new_message.from_name = data.get('from_name', '')
new_message.from_time = data.get('from_time')
new_message.message = data.get('message', '')
self.session_maker.add(new_message)
self.session_maker.commit()
self.datas = []
else:
Expand All @@ -54,15 +57,18 @@ def process_item(self, item, spider):
def close_spider(self, spider):
if len(self.datas) > 0:
for data in self.datas:
new_messaeg = TelegramMessage()
new_messaeg.message_id = data.get('message_id', -1)
new_messaeg.chat_id = data.get('chat_id', -1)
new_messaeg.user_id = data.get('user_id', -1)
new_messaeg.user_name = data.get('user_name', '')
new_messaeg.nick_name = data.get('nick_name', '')
new_messaeg.postal_time = data.get('postal_time', '')
new_messaeg.message = data.get('message', '')
self.session_maker.add(new_messaeg)
new_message = TelegramMessage()
new_message.message_id = data.get('message_id', -1)
new_message.chat_id = data.get('chat_id', -1)
new_message.user_id = data.get('user_id', -1)
new_message.user_name = data.get('user_name', '')
new_message.nick_name = data.get('nick_name', '')
new_message.postal_time = data.get('postal_time')
new_message.reply_to_msg_id = data.get('reply_to_msg_id', -1)
new_message.from_name = data.get('from_name', '')
new_message.from_time = data.get('from_time')
new_message.message = data.get('message', '')
self.session_maker.add(new_message)
self.session_maker.commit()
self.datas = []
self.session_maker.close_all()
Expand Down
36 changes: 34 additions & 2 deletions appspider/spiders/telegramspider/telegramAPIs.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,29 @@ def join_conversation(self, invite):

return result_json

def delete_all_dialog(self):
    """Delete every dialog of the client and print one line per dialog.

    Each dialog yielded by ``self.client.get_dialogs()`` has its entity
    passed to ``self.client.delete_dialog``. The printed line depends on
    the kind of dialog:

    * the entity has a ``title`` attribute (channel or group): the title
      is reported as a group that was left;
    * the dialog name is empty: reported as a "Deleted Account" dialog;
    * anything else: reported with the dialog name.
    """
    for dialog in self.client.get_dialogs():
        entity = dialog.entity
        # Decide the dialog kind up front; the delete call itself is the
        # same for every kind, so it is issued exactly once.
        is_group = hasattr(entity, 'title')
        self.client.delete_dialog(entity)
        if is_group:
            # Left a channel or group.
            print('已离开<{}>群组'.format(entity.title))
        elif dialog.name == '':
            # Dialog with a deleted account (empty display name).
            print('已删除Deleted Account用户对话框')
        else:
            # Ordinary user dialog, whether or not the name has a space.
            print('已删除{}用户对话框'.format(dialog.name))



def get_me(self):
    """Return the object produced by ``self.client.get_me()``."""
    return self.client.get_me()
Expand Down Expand Up @@ -218,9 +241,12 @@ def scan_message(self, chat, **kwargs):
continue
m = dict()
m['message_id'] = message.id
m['user_id'] = -1
m['user_id'] = 0
m['user_name'] = ''
m['nick_name'] = ''
m['reply_to_msg_id'] = 0
m['from_name'] = ''
m['from_time'] = datetime.datetime.fromtimestamp(657224281)
if message.sender:
m['user_id'] = message.sender.id
username = message.sender.username
Expand All @@ -235,8 +261,14 @@ def scan_message(self, chat, **kwargs):
first_name = first_name if first_name else ''
last_name = ' '+ last_name if last_name else ''
m['nick_name'] = '{0}{1}'.format(first_name, last_name)
if message.is_reply:
m['reply_to_msg_id'] = message.reply_to_msg_id
if message.forward:
m['from_name'] = message.forward.from_name
m['from_time'] = message.forward.date
m['chat_id'] = chat.id
m['postal_time'] = message.date.strftime('%Y-%m-%d %H:%M:%S')
# m['postal_time'] = message.date.strftime('%Y-%m-%d %H:%M:%S')
m['postal_time'] = message.date
m['message'] = content
tick += 1
if tick >= waterline:
Expand Down
2 changes: 1 addition & 1 deletion appspider/spiders/telegramspider/telegramScanMessages.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def scan_messages(self):
limit = self.group['limit']
offset_date = self.group['offset_date']
offset_date = offset_date if offset_date else None
chat = telegram_app.get_dialog(int(group_telegram_id), is_more=False)
chat = telegram_app.get_dialog(group_telegram_id, is_more=False)
if chat:
param = {
'limit': limit,
Expand Down
11 changes: 11 additions & 0 deletions configs/test_nextb.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"api_id": "",
"api_hash": "",
"session_name": "",
"group": {
"group_id": 788095144,
"limit": 10,
"last_message_id": -1,
"offset_date": ""
}
}
52 changes: 42 additions & 10 deletions run_spiders.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session

from appspider.items import TelegramMessage
from appspider.spiders.telegramspider.telegramAPIs import TelegramAPIs
from appspider.items import TelegramMessage, Base
from appspider.configs.postgreconfig import db_config


Expand Down Expand Up @@ -43,6 +44,10 @@ def create_session(self):
if self.session_maker is None:
self.session_maker = scoped_session(sessionmaker(autoflush=True, autocommit=False,
bind=self.engine))

# 创建表
def create_table(self):
    """Create all tables registered on ``Base.metadata`` via ``self.engine``."""
    metadata = Base.metadata
    metadata.create_all(self.engine)

def search_message(self, chat_id):
data = self.session_maker.query(TelegramMessage).filter(TelegramMessage.chat_id == chat_id).order_by(TelegramMessage.id.desc()).limit(1)
Expand All @@ -53,18 +58,45 @@ def search_message(self, chat_id):


# Script entry point. Reads the JSON config file named by sys.argv[1].
# NOTE(review): this block contains two generations of the function body.
# The nine statements from `nb = NextBTGDB()` through `cmdline.execute(...)`
# are repeated verbatim as comments at the end of the function, so they look
# like the pre-commit body kept by the diff view -- confirm which set of
# statements is actually live before relying on either.
def main():
# Create the tables
# nb = NextBTGDB()
# nb.create_table()
# Log in to the Telegram account and generate the login (session) file
with open(sys.argv[1], 'r') as f:
data = f.read()
config_js = json.loads(data)
# Look up the newest stored message for the configured group and, if one
# exists, copy its message_id into the config as 'last_message_id'.
nb = NextBTGDB()
chat_id = config_js.get('group', {}).get('group_id')
message_data = nb.search_message(chat_id=chat_id)
if message_data:
config_js['group']['last_message_id'] = message_data.message_id
# The whole config is passed to the spider as one base64-encoded JSON argument.
param_base64 = base64.b64encode(json.dumps(config_js).encode()).decode()
name = 'telegramScanMessages'
cmd = 'scrapy crawl {name} -L INFO -a param={param_base64}'.format(name=name, param_base64=param_base64)
cmdline.execute(cmd.split())
# Build the Telegram API wrapper from the session name / api id / api hash in the config.
ta = TelegramAPIs()
ta.init_client(session_name=config_js.get('session_name'), api_id=config_js.get('api_id'), api_hash=config_js.get('api_hash'))
# Delete all chat dialogs
# ta.delete_all_dialog()
# Fetch dialog information
dialogs = list()
for dialog in ta.get_dialog_list():
dialogs.append(dialog)
# ensure_ascii=False keeps non-ASCII characters readable in the printed JSON.
print(json.dumps(dialogs, ensure_ascii=False))
# Test crawling messages
# group = config_js.get('group')
# chat = ta.get_dialog(group.get('group_id'), is_more=False)
# param = {
# 'limit': group.get('limit'),
# 'offset_date': None,
# 'last_message_id': group.get('last_message_id')
# }
# for data in ta.scan_message(chat, **param):
# print(data)
# ta.close_client()
# with open(sys.argv[1], 'r') as f:
# data = f.read()
# config_js = json.loads(data)
# nb = NextBTGDB()
# chat_id = config_js.get('group', {}).get('group_id')
# message_data = nb.search_message(chat_id=chat_id)
# if message_data:
# config_js['group']['last_message_id'] = message_data.message_id
# param_base64 = base64.b64encode(json.dumps(config_js).encode()).decode()
# name = 'telegramScanMessages'
# cmd = 'scrapy crawl {name} -L INFO -a param={param_base64}'.format(name=name, param_base64=param_base64)
# cmdline.execute(cmd.split())

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
main()

0 comments on commit 9e0fc76

Please sign in to comment.